From b05f2a79067fed5484d70b4bd148e80249716e72 Mon Sep 17 00:00:00 2001 From: Tom Callaway Date: Fri, 15 Jan 2016 15:23:10 -0500 Subject: [PATCH 1/4] Requires: redhat-rpm-config on hardened systems (all Fedora and RHEL 7+) --- R.spec | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/R.spec b/R.spec index bc1bcd0..1f91f5a 100644 --- a/R.spec +++ b/R.spec @@ -7,6 +7,9 @@ # Assume not modern. Override if needed. %global modern 0 +# Track if we're hardening (all current fedora and RHEL 7+) +%global hardening 0 + %global with_lto 0 %global with_java_headless 0 @@ -26,10 +29,12 @@ %global system_tre 1 # %%global with_lto 1 %global with_java_headless 1 +%global hardening 1 %endif %if 0%{?fedora} %global modern 1 +%global hardening 1 %endif %if 0%{?rhel} >= 6 @@ -51,7 +56,7 @@ Name: R Version: 3.2.3 -Release: 1%{?dist} +Release: 2%{?dist} Summary: A language for data analysis and graphics URL: http://www.r-project.org Source0: ftp://cran.r-project.org/pub/R/src/base/R-3/R-%{version}.tar.gz @@ -147,6 +152,10 @@ and called at run time. Summary: The minimal R components necessary for a functional runtime Group: Applications/Engineering Requires: xdg-utils, cups +# R inherits the compiler flags it was built with, hence we need this on hardened systems +%if 0%{hardening} +Requires: redhat-rpm-config +%endif %if %{modern} Requires: tex(dvips), vi %else @@ -961,6 +970,9 @@ R CMD javareconf \ %postun -n libRmath -p /sbin/ldconfig %changelog +* Fri Jan 15 2016 Tom Callaway - 3.2.3-2 +- Requires: redhat-rpm-config on hardened systems (all Fedora and RHEL 7+) + * Fri Dec 11 2015 Tom Callaway - 3.2.3-1 - update to 3.2.3 From bf16df588a2497ce27141599a3a06a84da1d4896 Mon Sep 17 00:00:00 2001 From: Tom Callaway Date: Fri, 15 Jan 2016 15:24:56 -0500 Subject: [PATCH 2/4] fix bogus date, macro in comment warnings --- R.spec | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R.spec b/R.spec index 1f91f5a..37aeaed 100644 --- a/R.spec +++ b/R.spec @@ -21,7 +21,7 @@ %endif # Using lto breaks debuginfo. -# %%if 0%{?fedora} >= 19 +# %%if 0%%{?fedora} >= 19 # %%global with_lto 1 # %%endif @@ -846,7 +846,7 @@ make check %{_libdir}/R/library/utils/ %{_libdir}/R/modules %{_libdir}/R/COPYING -# %{_libdir}/R/NEWS* +# %%{_libdir}/R/NEWS* %{_libdir}/R/SVN-REVISION /usr/lib/rpm/R-make-search-index.sh %{_infodir}/R-*.info* @@ -860,7 +860,7 @@ make check %defattr(-, root, root, -) %{_libdir}/pkgconfig/libR.pc %{_includedir}/R -# Symlink to %{_includedir}/R/ +# Symlink to %%{_includedir}/R/ %{_libdir}/R/include %files devel @@ -979,7 +979,7 @@ R CMD javareconf \ * Wed Oct 28 2015 David Tardon - 3.2.2-3 - rebuild for ICU 56.1 -* Fri Oct 13 2015 Tom Callaway - 3.2.2-2 +* Tue Oct 13 2015 Tom Callaway - 3.2.2-2 - apply patches from upstream bug 16497 to fix X11 hangs * Fri Aug 14 2015 Tom Callaway - 3.2.2-1 From 61ab120199dc081cfc08ffcecc6431e9c2177c31 Mon Sep 17 00:00:00 2001 From: Tom Callaway Date: Tue, 26 Jan 2016 13:45:09 -0500 Subject: [PATCH 3/4] use global instead of define --- R.spec | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/R.spec b/R.spec index 37aeaed..1782c3e 100644 --- a/R.spec +++ b/R.spec @@ -1,7 +1,7 @@ %ifarch x86_64 -%define java_arch amd64 +%global java_arch amd64 %else -%define java_arch %{_arch} +%global java_arch %{_arch} %endif # Assume not modern. Override if needed. 
@@ -56,7 +56,7 @@ Name: R Version: 3.2.3 -Release: 2%{?dist} +Release: 3%{?dist} Summary: A language for data analysis and graphics URL: http://www.r-project.org Source0: ftp://cran.r-project.org/pub/R/src/base/R-3/R-%{version}.tar.gz @@ -345,7 +345,7 @@ cat < %{name}-prov %{__perl_provides} \ | grep -v 'File::Copy::Recursive' | grep -v 'Text::DelimMatch' EOF -%define __perl_provides %{_builddir}/R-%{version}/%{name}-prov +%global __perl_provides %{_builddir}/R-%{version}/%{name}-prov chmod +x %{__perl_provides} # Filter unwanted Requires: @@ -354,7 +354,7 @@ cat << \EOF > %{name}-req %{__perl_requires} \ | grep -v 'perl(Text::DelimMatch)' EOF -%define __perl_requires %{_builddir}/R-%{version}/%{name}-req +%global __perl_requires %{_builddir}/R-%{version}/%{name}-req chmod +x %{__perl_requires} %build @@ -970,6 +970,9 @@ R CMD javareconf \ %postun -n libRmath -p /sbin/ldconfig %changelog +* Tue Jan 26 2016 Tom Callaway - 3.2.3-3 +- use global instead of define + * Fri Jan 15 2016 Tom Callaway - 3.2.3-2 - Requires: redhat-rpm-config on hardened systems (all Fedora and RHEL 7+) From 7890c8ff08b927b72dac2b604494bd5d14c9e86f Mon Sep 17 00:00:00 2001 From: Tom Callaway Date: Tue, 26 Jan 2016 17:00:42 -0500 Subject: [PATCH 4/4] if texi2any is set to 0, then copy in prebuilt html manuals (RHEL 5 & 6 only) --- R-FAQ.html | 4225 +++++++++++++ R-admin.html | 7337 +++++++++++++++++++++++ R-data.html | 3190 ++++++++++ R-exts.html | 15950 +++++++++++++++++++++++++++++++++++++++++++++++++ R-intro.html | 9601 +++++++++++++++++++++++++++++ R-ints.html | 5791 ++++++++++++++++++ R-lang.html | 6228 +++++++++++++++++++ R.spec | 26 +- 8 files changed, 52347 insertions(+), 1 deletion(-) create mode 100644 R-FAQ.html create mode 100644 R-admin.html create mode 100644 R-data.html create mode 100644 R-exts.html create mode 100644 R-intro.html create mode 100644 R-ints.html create mode 100644 R-lang.html diff --git a/R-FAQ.html b/R-FAQ.html new file mode 100644 index 0000000..92654d7 --- /dev/null +++ b/R-FAQ.html @@ -0,0 +1,4225 @@ + + + + +R FAQ + + + + + + + + + + + + + + + +

R FAQ

Frequently Asked Questions on R

Version 3.2.2015-12-04

Kurt Hornik

Table of Contents
R FAQ

1 Introduction

+ +

This document contains answers to some of the most frequently asked +questions about R. +

+ + + + + + + + +
+ + + +

1.1 Legalese

+ +

This document is copyright © 1998–2015 by Kurt +Hornik. +

+

This document is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published +by the Free Software Foundation; either version 2, or (at your option) +any later version. +

+

This document is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +

+

Copies of the GNU General Public License versions are +available at +


1.2 Obtaining this document

+ +

The latest version of this document is always available from +

+ + +

From there, you can obtain versions converted to +plain +ASCII text, +GNU +info, HTML, +PDF, +as well as the Texinfo source used for creating all these formats using the +GNU Texinfo system. +

+

You can also obtain the R FAQ from the doc/FAQ +subdirectory of a CRAN site (see What is CRAN?). +


1.3 Citing this document

+ +

In publications, please refer to this FAQ as Hornik +(2015), “The R FAQ”, and give the above, +official URL: +

+
+
@Misc{,
+  author        = {Kurt Hornik},
+  title         = {{R} {FAQ}},
+  year          = {2015},
+  url           = {https://CRAN.R-project.org/doc/FAQ/R-FAQ.html}
+}
+

1.4 Notation

+ +

Everything should be pretty standard. ‘R>’ is used for the R +prompt, and a ‘$’ for the shell prompt (where applicable). +


1.5 Feedback

+ +

Feedback via email to Kurt.Hornik@R-project.org is of course +most welcome. +

+

In particular, note that I do not have access to Windows or Mac systems. Features specific to the Windows and OS X ports of R are described in the “R for Windows FAQ” and the “R for Mac OS X FAQ”. If you have information on Mac or Windows systems that you think should be added to this document, please let me know.


2 R Basics


2.1 What is R?

+ +

R is a system for statistical computation and graphics. It consists of +a language plus a run-time environment with graphics, a debugger, access +to certain system functions, and the ability to run programs stored in +script files. +

+

The design of R has been heavily influenced by two existing languages: +Becker, Chambers & Wilks’ S (see What is S?) and Sussman’s +Scheme. +Whereas the resulting language is very similar in appearance to S, the +underlying implementation and semantics are derived from Scheme. +See What are the differences between R and S?, for further details. +

+

The core of R is an interpreted computer language which allows branching +and looping as well as modular programming using functions. Most of the +user-visible functions in R are written in R. It is possible for the +user to interface to procedures written in the C, C++, or FORTRAN +languages for efficiency. The R distribution contains functionality for +a large number of statistical procedures. Among these are: linear and +generalized linear models, nonlinear regression models, time series +analysis, classical parametric and nonparametric tests, clustering and +smoothing. There is also a large set of functions which provide a +flexible graphical environment for creating various kinds of data +presentations. Additional modules (“add-on packages”) are available +for a variety of specific purposes (see R Add-On Packages). +

+

R was initially written by Ross Ihaka +and Robert Gentleman at the +Department of Statistics of the University of Auckland in Auckland, New +Zealand. In addition, a large group of individuals has contributed to R +by sending code and bug reports. +

+

Since mid-1997 there has been a core group (the “R Core Team”) who can +modify the R source code archive. The group currently consists of Doug +Bates, John Chambers, Peter Dalgaard, Seth Falcon, Robert Gentleman, +Kurt Hornik, Stefano Iacus, Ross Ihaka, Friedrich Leisch, Uwe Ligges, +Thomas Lumley, Martin Maechler, Duncan Murdoch, Paul Murrell, Martyn +Plummer, Brian Ripley, Deepayan Sarkar, Duncan Temple Lang, Luke +Tierney, and Simon Urbanek. +

+

R has a home page at https://www.R-project.org/. It is +free software +distributed under a GNU-style +copyleft, and an +official part of the GNU project +(“GNU S”). +

+
+ + + +

2.2 What machines does R run on?

+ +

R is being developed for the Unix-like, Windows and Mac families of +operating systems. Support for Mac OS Classic ended with R 1.7.1. +

+

The current version of R will configure and build under a number of +common Unix-like (e.g., https://en.wikipedia.org/wiki/Unix-like) +platforms including cpu-linux-gnu for the i386, amd64, alpha, +arm/armel, hppa, ia64, m68k, mips/mipsel, powerpc, s390 and sparc +CPUs (e.g., https://buildd.debian.org/build.php?&pkg=r-base), +i386-hurd-gnu, cpu-kfreebsd-gnu for i386 and amd64, +powerpc-apple-darwin, mips-sgi-irix, +i386-freebsd, +rs6000-ibm-aix, and sparc-sun-solaris. +

+ +

If you know about other platforms, please drop us a note. +

+
+ + + +

2.3 What is the current version of R?

+ +

The current released version is 3.2.3. Based on this +‘major.minor.patchlevel’ numbering scheme, there are two development +versions of R, a patched version of the current release (‘r-patched’) +and one working towards the next minor or eventually major (‘r-devel’) +releases of R, respectively. Version r-patched is for bug fixes mostly. +New features are typically introduced in r-devel. +
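From within a running R session the installed version can be checked against this numbering scheme; a minimal sketch (the output of course depends on the version actually installed):

R.version.string            ## e.g. "R version 3.2.3 (...)" on the current release
getRversion()               ## the version as a comparable object
getRversion() >= "3.2.0"    ## TRUE on any 3.2.x release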

+
+ + + +

2.4 How can R be obtained?

+ +

Sources, binaries and documentation for R can be obtained via CRAN, +the “Comprehensive R Archive Network” (see What is CRAN?). +

+

Sources are also available via https://svn.R-project.org/R/, the +R Subversion repository, but currently not via anonymous rsync (nor +CVS). +

+

Tarballs with daily snapshots of the r-devel and r-patched development +versions of R can be found at +https://stat.ethz.ch/R/daily. +

+ + + + + +
+ + + +

2.5 How can R be installed?

+ + + + + + + +
+ + + +

2.5.1 How can R be installed (Unix-like)

+ +

If R is already installed, it can be started by typing R at the +shell prompt (of course, provided that the executable is in your path). +

+

If binaries are available for your platform (see Are there Unix-like binaries for R?), you can use these, following the +instructions that come with them. +

+

Otherwise, you can compile and install R yourself, which can be done +very easily under a number of common Unix-like platforms (see What machines does R run on?). The file INSTALL that comes with the +R distribution contains a brief introduction, and the “R Installation +and Administration” guide (see What documentation exists for R?) +has full details. +

+

Note that you need a FORTRAN compiler or perhaps f2c in +addition to a C compiler to build R. +

+

In the simplest case, untar the R source code, change to the directory +thus created, and issue the following commands (at the shell prompt): +

+
+
$ ./configure
+$ make
+
+ +

If these commands execute successfully, the R binary and a shell script +front-end called R are created and copied to the bin +directory. You can copy the script to a place where users can invoke +it, for example to /usr/local/bin. In addition, plain text help +pages as well as HTML and LaTeX versions of the documentation are +built. +

+

Use make dvi to create DVI versions of the R manuals, such as +refman.dvi (an R object reference index) and R-exts.dvi, +the “R Extension Writers Guide”, in the doc/manual +subdirectory. These files can be previewed and printed using standard +programs such as xdvi and dvips. You can also use +make pdf to build PDF (Portable Document Format) version of the +manuals, and view these using e.g. Acrobat. Manuals written in the +GNU Texinfo system can also be converted to info files +suitable for reading online with Emacs or stand-alone GNU +Info; use make info to create these versions (note that this +requires Makeinfo version 4.5). +

+

Finally, use make check to find out whether your R system works +correctly. +

+

You can also perform a “system-wide” installation using make +install. By default, this will install to the following directories: +

+
+
${prefix}/bin
+

the front-end shell script +

+
${prefix}/man/man1
+

the man page +

+
${prefix}/lib/R
+

all the rest (libraries, on-line help system, …). This is the “R +Home Directory” (R_HOME) of the installed system. +

+
+ +

In the above, prefix is determined during configuration +(typically /usr/local) and can be set by running +configure with the option +

+
+
$ ./configure --prefix=/where/you/want/R/to/go
+
+ +

(E.g., the R executable will then be installed into +/where/you/want/R/to/go/bin.) +

+

To install DVI, info and PDF versions of the manuals, use make +install-dvi, make install-info and make install-pdf, +respectively. +

+
+ + + +

2.5.2 How can R be installed (Windows)

+ +

The bin/windows directory of a CRAN site contains binaries for +a base distribution and add-on packages from CRAN to run on Windows +XP and later (including 64-bit versions of Windows) on ix86 and x86_64 +chips. The Windows version of R was created by Robert Gentleman and +Guido Masarotto, and is now being developed and maintained by +Duncan Murdoch and +Brian D. Ripley. +

+

The same directory has links to snapshots of the r-patched and r-devel +versions of R. +

+

See the “R for Windows FAQ” for more details.

+
+ + + +

2.5.3 How can R be installed (Mac)

+ +

The bin/macosx directory of a CRAN site contains a standard Apple installer package to run on OS X 10.6 (‘Snow Leopard’) and later. Once downloaded and executed, the installer will install the current release of R and an R.app OS X GUI. This port of R for OS X is maintained by Simon Urbanek (and previously by Stefano Iacus). The “R for Mac OS X FAQ” has more details.

+

Snapshots of the r-patched and r-devel versions of R are available as +Apple installer packages at https://r.research.att.com. +

+
+ + + +

2.6 Are there Unix-like binaries for R?

+ + +

The bin/linux directory of a CRAN site contains the following +packages. +

+
+ + + + + +
CPUVersionsProvider
Debiani386/amd64squeeze/wheezyJohannes Ranke
armelwheezyJohannes Ranke
Ubuntui386/amd64lucid/precise/trustyMichael Rutter
+
+ +

Debian packages, maintained by Dirk Eddelbuettel, have long been part of +the Debian distribution, and can be accessed through APT, the Debian +package maintenance tool. Use e.g. apt-get install r-base +r-recommended to install the R environment and recommended packages. +If you also want to build R packages from source, also run apt-get +install r-base-dev to obtain the additional tools required for this. +So-called “backports” of the current R packages for at least the +stable distribution of Debian are provided by Johannes Ranke, and +available from CRAN. See +https://CRAN.R-project.org/bin/linux/debian/index.html for details on R +Debian packages and installing the backports, which should also be +suitable for other Debian derivatives. Native backports for Ubuntu are +provided by Michael Rutter. +

+

R binaries for Fedora, maintained by Tom “Spot” Callaway, are provided +as part of the Fedora distribution and can be accessed through +yum, the RPM installer/updater. Note that the “Software” +application (gnome-software), which is the default GUI for +software installation in Fedora 20, cannot be used to install R. It is +therefore recommended to use the yum command line tool. +The Fedora R RPM is a “meta-package” which installs all the user and +developer components of R (available separately as R-core and +R-devel), as well as R-java, which ensures that R is +configured for use with Java. The R RPM also installs the standalone R +math library (libRmath and libRmath-devel), although this +is not necessary to use R. When a new version of R is released, there +may be a delay of up to 2 weeks until the Fedora RPM becomes publicly +available, as it must pass through the statutory Fedora review process. +RPMs for a selection of R packages are also provided by Fedora. The +Extra Packages for Enterprise Linux (EPEL) project +(https://fedoraproject.org/wiki/EPEL) provides ports of the Fedora +RPMs for RedHat Enterprise Linux and compatible distributions (e.g., +Centos, Scientific Linux, Oracle Linux). +

+

See https://CRAN.R-project.org/bin/linux/suse/README.html for +information about RPMs for openSUSE. +

+

No other binary distributions are currently publicly available via CRAN.

+
+ + + +

2.7 What documentation exists for R?

+ +

Online documentation for most of the functions and variables in R exists, and can be printed on-screen by typing help(name) (or ?name) at the R prompt, where name is the name of the topic help is sought for. (In the case of unary and binary operators and control-flow special forms, the name may need to be quoted.)
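For example (any installed help topic works the same way; the topics below are just illustrations):

help(mean)      ## same as ?mean
?"+"            ## unary and binary operators must be quoted
help("for")     ## so must control-flow special forms such as for, if, while
help("[[")      ## and the subscripting operators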

+

This documentation can also be made available as one reference manual +for on-line reading in HTML and PDF formats, and as hardcopy via +LaTeX, see How can R be installed?. An up-to-date HTML +version is always available for web browsing at +https://stat.ethz.ch/R-manual/. +

+

Printed copies of the R reference manual for some version(s) are +available from Network Theory Ltd, at +http://www.network-theory.co.uk/R/base/. For each set of manuals +sold, the publisher donates USD 10 to the R Foundation (see What is the R Foundation?). +

+

The R distribution also comes with the following manuals. +

+
    +
  • “An Introduction to R” (R-intro) +includes information on data types, programming elements, statistical +modeling and graphics. This document is based on the “Notes on +S-PLUS” by Bill Venables and David Smith. +
  • “Writing R Extensions” (R-exts) +currently describes the process of creating R add-on packages, writing R +documentation, R’s system and foreign language interfaces, and the R +API. +
  • “R Data Import/Export” (R-data) +is a guide to importing and exporting data to and from R. +
  • “The R Language Definition” (R-lang), +a first version of the “Kernighan & Ritchie of R”, explains +evaluation, parsing, object oriented programming, computing on the +language, and so forth. +
  • “R Installation and Administration” (R-admin). +
  • “R Internals” (R-ints) +is a guide to R’s internal structures. +(Added in R 2.4.0.) +
+ +

An annotated bibliography (BibTeX format) of R-related publications +can be found at +

+ + +

Books on R by R Core Team members include +

+
+

John M. Chambers (2008), “Software for Data Analysis: Programming with +R”. Springer, New York, ISBN 978-0-387-75935-7, +http://statweb.stanford.edu/~jmc4/Rbook/. +

+

Peter Dalgaard (2008), “Introductory Statistics with R”, 2nd edition. +Springer, ISBN 978-0-387-79053-4, +http://publicifsv.sund.ku.dk/~pd/ISwR.html. +

+

Robert Gentleman (2008), “R Programming for Bioinformatics”. Chapman +& Hall/CRC, Boca Raton, FL, ISBN 978-1-420-06367-7, +https://www.bioconductor.org/pub/RBioinf/. +

+

Stefano M. Iacus (2008), “Simulation and Inference for Stochastic +Differential Equations: With R Examples”. Springer, New York, ISBN +978-0-387-75838-1. +

+

Deepayan Sarkar (2007), “Lattice: Multivariate Data Visualization with +R”. Springer, New York, ISBN 978-0-387-75968-5. +

+

W. John Braun and Duncan J. Murdoch (2007), “A First Course in +Statistical Programming with R”. Cambridge University Press, +Cambridge, ISBN 978-0521872652. +

+

P. Murrell (2005), “R Graphics”, Chapman & Hall/CRC, ISBN: +1-584-88486-X, +https://www.stat.auckland.ac.nz/~paul/RGraphics/rgraphics.html. +

+

William N. Venables and Brian D. Ripley (2002), “Modern Applied +Statistics with S” (4th edition). Springer, ISBN 0-387-95457-0, +https://www.stats.ox.ac.uk/pub/MASS4/. +

+

Jose C. Pinheiro and Douglas M. Bates (2000), “Mixed-Effects Models in +S and S-Plus”. Springer, ISBN 0-387-98957-0. +

+ +

Last, but not least, Ross’ and Robert’s experience in designing and +implementing R is described in Ihaka & Gentleman (1996), “R: A Language +for Data Analysis and Graphics”, +Journal of +Computational and Graphical Statistics, 5, 299–314. +

+
+ + + +

2.8 Citing R

+ +

To cite R in publications, use +

+
+
@Manual{,
+  title        = {R: A Language and Environment for Statistical
+                  Computing},
+  author       = {{R Core Team}},
+  organization = {R Foundation for Statistical Computing},
+  address      = {Vienna, Austria},
+  year         = 2015,
+  url          = {https://www.R-project.org}
+}
+
+ +

Citation strings (or BibTeX entries) for R and R packages can also be +obtained by citation(). +
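A short sketch (the package name is only an example; any installed package can be queried):

citation()                  ## how to cite R itself
citation("lattice")         ## citation information for an installed package
toBibtex(citation())        ## the same information as a BibTeX entry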

+
+ +
+

+Next: , Previous: , Up: R Basics   [Contents]

+
+ +

2.9 What mailing lists exist for R?

+ +

Thanks to Martin Maechler, there +are several mailing lists devoted to R, including the following: +

+
+
R-announce
+

A moderated list for major announcements about the development of R and +the availability of new code. +

+
R-packages
+

A moderated list for announcements on the availability of new or +enhanced contributed packages. +

+
R-help
+

The ‘main’ R mailing list, for discussion about problems and solutions +using R, announcements (not covered by ‘R-announce’ and ‘R-packages’) +about the development of R and the availability of new code. +

+
R-devel
+

This list is for questions and discussion about code development in R. +

+
R-package-devel
+

A list which provides a forum for learning about the R package development process.

+
+ +

Please read the posting guide before sending anything to any mailing list. +

+

Note in particular that R-help is intended to be comprehensible to +people who want to use R to solve problems but who are not necessarily +interested in or knowledgeable about programming. Questions likely to +prompt discussion unintelligible to non-programmers (e.g., questions +involving C or C++) should go to R-devel. +

+

Convenient access to information on these lists, subscription, and +archives is provided by the web interface at +https://stat.ethz.ch/mailman/listinfo/. One can also subscribe +(or unsubscribe) via email, e.g. to R-help by sending ‘subscribe’ +(or ‘unsubscribe’) in the body of the message (not in the +subject!) to R-help-request@lists.R-project.org. +

+

Send email to R-help@lists.R-project.org to send a message to +everyone on the R-help mailing list. Subscription and posting to the +other lists is done analogously, with ‘R-help’ replaced by +‘R-announce’, ‘R-packages’, and ‘R-devel’, respectively. +Note that the R-announce and R-packages lists are gatewayed into R-help. +Hence, you should subscribe to either of them only in case you are not +subscribed to R-help. +

+

It is recommended that you send mail to R-help rather than only to the R +Core developers (who are also subscribed to the list, of course). This +may save them precious time they can use for constantly improving R, and +will typically also result in much quicker feedback for yourself. +

+

Of course, in the case of bug reports it would be very helpful to have +code which reliably reproduces the problem. Also, make sure that you +include information on the system and version of R being used. See +R Bugs for more details. +

+

See https://www.R-project.org/mail.html for more information on +the R mailing lists. +

+

The R Core Team can be reached at R-core@lists.R-project.org +for comments and reports. +

+

Many of the R project’s mailing lists are also available via +Gmane, from which they can be read with a web +browser, using an NNTP news reader, or via RSS feeds. See +http://dir.gmane.org/index.php?prefix=gmane.comp.lang.r. for +the available mailing lists, and http://www.gmane.org/rss.php for +details on RSS feeds. +

+
+ + + +

2.10 What is CRAN?

+ +

The “Comprehensive R Archive Network” (CRAN) is a collection of +sites which carry identical material, consisting of the R +distribution(s), the contributed extensions, documentation for R, and +binaries. +

+

The CRAN master site at WU (Wirtschaftsuniversität Wien) in Austria +can be found at the URL +

+
+

https://CRAN.R-project.org/ +

+ +

and is mirrored daily to many sites around the world. +See https://CRAN.R-project.org/mirrors.html for a complete list of +mirrors. Please use the CRAN site closest to you to reduce network +load. +
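From within R, the mirror used by install.packages() and related functions can be chosen interactively or set explicitly; a minimal sketch (the URL below is the master site, so substitute your nearest mirror, and the package name is only an example):

chooseCRANmirror()                                       ## pick a nearby mirror interactively
options(repos = c(CRAN = "https://CRAN.R-project.org"))  ## or set one explicitly
install.packages("boot")                                 ## subsequent installs use that mirror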

+

From CRAN, you can obtain the latest official release of R, daily +snapshots of R (copies of the current source trees), as gzipped and +bzipped tar files, a wealth of additional contributed code, as well as +prebuilt binaries for various operating systems (Linux, Mac OS Classic, +OS X, and MS Windows). CRAN also provides access to +documentation on R, existing mailing lists and the R Bug Tracking +system. +

+

Please always use the URL of the master site when referring to +CRAN. +

+
+ +
+

+Next: , Previous: , Up: R Basics   [Contents]

+
+ +

2.11 Can I use R for commercial purposes?

+ +

R is released under the +GNU General Public License (GPL), version 2. If +you have any questions regarding the legality of using R in any +particular situation you should bring it up with your legal counsel. We +are in no position to offer legal advice. +

+

It is the opinion of the R Core Team that one can use R for commercial +purposes (e.g., in business or in consulting). The GPL, like +all Open Source licenses, permits all and any use of the package. It +only restricts distribution of R or of other programs containing code +from R. This is made clear in clause 6 (“No Discrimination Against +Fields of Endeavor”) of the +Open Source +Definition: +

+
+

The license must not restrict anyone from making use of the program in a +specific field of endeavor. For example, it may not restrict the +program from being used in a business, or from being used for genetic +research. +

+ +

It is also explicitly stated in clause 0 of the GPL, which says in part +

+
+

Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of running +the Program is not restricted, and the output from the Program is +covered only if its contents constitute a work based on the Program. +

+ +

Most add-on packages, including all recommended ones, also explicitly +allow commercial use in this way. A few packages are restricted to +“non-commercial use”; you should contact the author to clarify whether +these may be used or seek the advice of your legal counsel. +

+

None of the discussion in this section constitutes legal advice. The R +Core Team does not provide legal advice under any circumstances. +

+
+ + + +

2.12 Why is R named R?

+ +

The name is partly based on the (first) names of the first two R authors +(Robert Gentleman and Ross Ihaka), and partly a play on the name of the +Bell Labs language ‘S’ (see What is S?). +

+ +
+ +
+

+Next: , Previous: , Up: R Basics   [Contents]

+
+ +

2.13 What is the R Foundation?

+ +

The R Foundation is a not for profit organization working in the public +interest. It was founded by the members of the R Core Team in order to +provide support for the R project and other innovations in statistical +computing, provide a reference point for individuals, institutions or +commercial enterprises that want to support or interact with the R +development community, and to hold and administer the copyright of R +software and documentation. See +https://www.R-project.org/foundation/ for more information. +

+
+ +
+

+Previous: , Up: R Basics   [Contents]

+
+ +

2.14 What is R-Forge?

+ +

R-Forge (https://R-Forge.R-project.org/) offers a central platform +for the development of R packages, R-related software and further +projects. It is based on GForge offering +easy access to the best in SVN, daily built and checked packages, +mailing lists, bug tracking, message boards/forums, site hosting, +permanent file archival, full backups, and total web-based +administration. For more information, see the R-Forge web page and +Stefan Theußl and Achim Zeileis (2009), “Collaborative software +development using R-Forge”, The R Journal, 1(1):9–14. +

+ +
+ +
+

+Next: , Previous: , Up: Top   [Contents]

+
+ +

3 R and S

+ + + + + + + + + +
+ +
+

+Next: , Previous: , Up: R and S   [Contents]

+
+ +

3.1 What is S?

+ +

S is a very high level language and an environment for data analysis and +graphics. In 1998, the Association for Computing Machinery +(ACM) presented its Software System Award to John M. Chambers, +the principal designer of S, for +

+
+

the S system, which has forever altered the way people analyze, +visualize, and manipulate data … +

+

S is an elegant, widely accepted, and enduring software system, with +conceptual integrity, thanks to the insight, taste, and effort of John +Chambers. +

+ +

The evolution of the S language is characterized by four books by John +Chambers and coauthors, which are also the primary references for S. +

+
    +
  • Richard A. Becker and John M. Chambers (1984), “S. An Interactive +Environment for Data Analysis and Graphics,” Monterey: Wadsworth and +Brooks/Cole. + +

    This is also referred to as the “Brown Book”, and of historical +interest only. +

    +
  • Richard A. Becker, John M. Chambers and Allan R. Wilks (1988), “The New +S Language,” London: Chapman & Hall. + +

    This book is often called the “Blue Book”, and introduced what +is now known as S version 2. +

    +
  • John M. Chambers and Trevor J. Hastie (1992), “Statistical Models in +S,” London: Chapman & Hall. + +

    This is also called the “White Book”, and introduced S version +3, which added structures to facilitate statistical modeling in S. +

    +
  • John M. Chambers (1998), “Programming with Data,” New York: Springer, +ISBN 0-387-98503-4 +(http://statweb.stanford.edu/~jmc4/Sbook/). + +

    This “Green Book” describes version 4 of S, a major revision of +S designed by John Chambers to improve its usefulness at every stage of +the programming process. +

+ +

See http://statweb.stanford.edu/~jmc4/papers/96.7.ps +for further information on the “Evolution of the S Language”. +

+

There is a huge amount of user-contributed code for S, available at the +S Repository at CMU. +

+ +
+ + + +

3.2 What is S-PLUS?

+ +

S-PLUS is a value-added version of S currently sold by +TIBCO Software Inc as ‘TIBCO Spotfire S+’. +See https://en.wikipedia.org/wiki/S-PLUS for more information. +

+
+ + + +

3.3 What are the differences between R and S?

+ +

We can regard S as a language with three current implementations or +“engines”, the “old S engine” (S version 3; S-PLUS 3.x and 4.x), +the “new S engine” (S version 4; S-PLUS 5.x and above), and R. +Given this understanding, asking for “the differences between R and S” +really amounts to asking for the specifics of the R implementation of +the S language, i.e., the difference between the R and S engines. +

+

For the remainder of this section, “S” refers to the S engines and not +the S language. +

+ + + + + + +
+ + + +

3.3.1 Lexical scoping

+ +

Contrary to other implementations of the S language, R has adopted an +evaluation model in which nested function definitions are lexically +scoped. This is analogous to the evaluation model in Scheme. +

+

This difference becomes manifest when free variables occur in a +function. Free variables are those which are neither formal parameters +(occurring in the argument list of the function) nor local variables +(created by assigning to them in the body of the function). In S, the +values of free variables are determined by a set of global variables +(similar to C, there is only local and global scope). In R, they are +determined by the environment in which the function was created. +

+

Consider the following function: +

+
+
cube <- function(n) {
+  sq <- function() n * n
+  n * sq()
+}
+
+ +

Under S, sq() does not “know” about the variable n +unless it is defined globally: +

+
+
S> cube(2)
+Error in sq():  Object "n" not found
+Dumped
+S> n <- 3
+S> cube(2)
+[1] 18
+
+ +

In R, the “environment” created when cube() was invoked is +also looked in: +

+
+
R> cube(2)
+[1] 8
+
+ + + + + + + +

As a more “interesting” real-world problem, suppose you want to write +a function which returns the density function of the r-th order +statistic from a sample of size n from a (continuous) +distribution. For simplicity, we shall use both the cdf and pdf of the +distribution as explicit arguments. (Example compiled from various +postings by Luke Tierney.) +

+

The S-PLUS documentation for call() basically suggests the +following: +

+
+
dorder <- function(n, r, pfun, dfun) {
+  f <- function(x) NULL
+  con <- round(exp(lgamma(n + 1) - lgamma(r) - lgamma(n - r + 1)))
+  PF <- call(substitute(pfun), as.name("x"))
+  DF <- call(substitute(dfun), as.name("x"))
+  f[[length(f)]] <-
+    call("*", con,
+         call("*", call("^", PF, r - 1),
+              call("*", call("^", call("-", 1, PF), n - r),
+                   DF)))
+  f
+}
+
+ +

Rather tricky, isn’t it? The code uses the fact that in S, +functions are just lists of special mode with the function body as the +last argument, and hence does not work in R (one could make the idea +work, though). +

+

A version which makes heavy use of substitute() and seems to work +under both S and R is +

+
+
dorder <- function(n, r, pfun, dfun) {
+  con <- round(exp(lgamma(n + 1) - lgamma(r) - lgamma(n - r + 1)))
+  eval(substitute(function(x) K * PF(x)^a * (1 - PF(x))^b * DF(x),
+                  list(PF = substitute(pfun), DF = substitute(dfun),
+                       a = r - 1, b = n - r, K = con)))
+}
+
+ +

(the eval() is not needed in S). +

+

However, in R there is a much easier solution: +

+
+
dorder <- function(n, r, pfun, dfun) {
+  con <- round(exp(lgamma(n + 1) - lgamma(r) - lgamma(n - r + 1)))
+  function(x) {
+    con * pfun(x)^(r - 1) * (1 - pfun(x))^(n - r) * dfun(x)
+  }
+}
+
+ +

This seems to be the “natural” implementation, and it works because +the free variables in the returned function can be looked up in the +defining environment (this is lexical scope). +
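A brief usage sketch of the R version above, using the uniform distribution so the result can be checked against the corresponding Beta density:

d2of3 <- dorder(3, 2, punif, dunif)   ## density of the median of 3 U(0,1) draws
d2of3(0.5)                            ## should be 1.5, i.e. dbeta(0.5, 2, 2)
integrate(d2of3, 0, 1)                ## should integrate to (approximately) 1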

+

Note that what you really need is the function closure, i.e., the +body along with all variable bindings needed for evaluating it. Since +in the above version, the free variables in the value function are not +modified, you can actually use it in S as well if you abstract out the +closure operation into a function MC() (for “make closure”): +

+
+
dorder <- function(n, r, pfun, dfun) {
+  con <- round(exp(lgamma(n + 1) - lgamma(r) - lgamma(n - r + 1)))
+  MC(function(x) {
+       con * pfun(x)^(r - 1) * (1 - pfun(x))^(n - r) * dfun(x)
+     },
+     list(con = con, pfun = pfun, dfun = dfun, r = r, n = n))
+}
+
+ +

Given the appropriate definitions of the closure operator, this works in +both R and S, and is much “cleaner” than a substitute/eval solution +(or one which overrules the default scoping rules by using explicit +access to evaluation frames, as is of course possible in both R and S). +

+

For R, MC() simply is +

+
+
MC <- function(f, env) f
+
+ +

(lexical scope!), a version for S is +

+
+
MC <- function(f, env = NULL) {
+  env <- as.list(env)
+  if (mode(f) != "function")
+    stop(paste("not a function:", f))
+  if (length(env) > 0 && any(names(env) == ""))
+    stop(paste("not all arguments are named:", env))
+  fargs <- if(length(f) > 1) f[1:(length(f) - 1)] else NULL
+  fargs <- c(fargs, env)
+  if (any(duplicated(names(fargs))))
+    stop(paste("duplicated arguments:", paste(names(fargs)),
+         collapse = ", "))
+  fbody <- f[length(f)]
+  cf <- c(fargs, fbody)
+  mode(cf) <- "function"
+  return(cf)
+}
+
+ +

Similarly, most optimization (or zero-finding) routines need some +arguments to be optimized over and have other parameters that depend on +the data but are fixed with respect to optimization. With R scoping +rules, this is a trivial problem; simply make up the function with the +required definitions in the same environment and scoping takes care of +it. With S, one solution is to add an extra parameter to the function +and to the optimizer to pass in these extras, which however can only +work if the optimizer supports this. +
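A minimal sketch of this idea, with made-up names and data: the data are captured in the environment of the objective function, so the optimizer only ever sees the parameters.

make_rss <- function(x, y) {
  ## x and y are fixed data; beta is what the optimizer varies
  function(beta) sum((y - beta[1] - beta[2] * x)^2)
}
x <- 1:10
y <- 2 + 3 * x + rnorm(10)
rss <- make_rss(x, y)        ## the data travel inside the closure
optim(c(0, 0), rss)$par      ## roughly c(2, 3); no extra data arguments needed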

+

Nested lexically scoped functions allow using function closures and +maintaining local state. A simple example (taken from Abelson and +Sussman) is obtained by typing demo("scoping") at the R prompt. +Further information is provided in the standard R reference “R: A +Language for Data Analysis and Graphics” (see What documentation exists for R?) and in Robert Gentleman and Ross Ihaka (2000), “Lexical +Scope and Statistical Computing”, +Journal of +Computational and Graphical Statistics, 9, 491–508. +
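In the same spirit as demo("scoping"), here is a minimal sketch of a closure maintaining local state (the function names are made up):

make_counter <- function() {
  count <- 0
  function() {
    count <<- count + 1   ## superassignment updates the enclosing environment
    count
  }
}
counter <- make_counter()
counter()   ## 1
counter()   ## 2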

+

Nested lexically scoped functions also imply a further major difference. +Whereas S stores all objects as separate files in a directory somewhere +(usually .Data under the current directory), R does not. All +objects in R are stored internally. When R is started up it grabs a +piece of memory and uses it to store the objects. R performs its own +memory management of this piece of memory, growing and shrinking its +size as needed. Having everything in memory is necessary because it is +not really possible to externally maintain all relevant “environments” +of symbol/value pairs. This difference also seems to make R +faster than S. +

+

The down side is that if R crashes you will lose all the work for the +current session. Saving and restoring the memory “images” (the +functions and data stored in R’s internal memory at any time) can be a +bit slow, especially if they are big. In S this does not happen, +because everything is saved in disk files and if you crash nothing is +likely to happen to them. (In fact, one might conjecture that the S +developers felt that the price of changing their approach to persistent +storage just to accommodate lexical scope was far too expensive.) +Hence, when doing important work, you might consider saving often (see +How can I save my workspace?) to safeguard against possible +crashes. Other possibilities are logging your sessions, or have your R +commands stored in text files which can be read in using +source(). +

+
+

Note: If you run R from within Emacs (see R and Emacs), you can save the +contents of the interaction buffer to a file and conveniently manipulate +it using ess-transcript-mode, as well as save source copies of +all functions and data used. +

+ +
+ + + +

3.3.2 Models

+ +

There are some differences in the modeling code, such as +

+
    +
  • Whereas in S, you would use lm(y ~ x^3) to regress y on x^3, in R, you have to insulate powers of numeric vectors (using I()), i.e., you have to use lm(y ~ I(x^3)) (see the sketch after this list).
  • The glm family objects are implemented differently in R and S. The same +functionality is available but the components have different names. +
  • Option na.action is set to "na.omit" by default in R, +but not set in S. +
  • Terms objects are stored differently. In S a terms object is an +expression with attributes, in R it is a formula with attributes. The +attributes have the same names but are mostly stored differently. +
  • Finally, in R y ~ x + 0 is an alternative to y ~ x - 1 for +specifying a model with no intercept. Models with no parameters at all +can be specified by y ~ 0. +
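A small sketch illustrating some of the points in this list (made-up data; the fitted output will of course differ):

x <- rnorm(20); y <- x^3 + rnorm(20)
lm(y ~ I(x^3))              ## R: powers of numeric vectors must be insulated
getOption("na.action")      ## "na.omit" by default in R
lm(y ~ x - 1)               ## no-intercept model, as in S
lm(y ~ x + 0)               ## equivalent alternative accepted by R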
+ +
+ + + +

3.3.3 Others

+ +

Apart from lexical scoping and its implications, R follows the S +language definition in the Blue and White Books as much as possible, and +hence really is an “implementation” of S. There are some intentional +differences where the behavior of S is considered “not clean”. In +general, the rationale is that R should help you detect programming +errors, while at the same time being as compatible as possible with S. +

+

Some known differences are the following. +

+
    +
  • In R, if x is a list, then x[i] <- NULL and x[[i]] <- NULL remove the specified elements from x. The first of these is incompatible with S, where it is a no-op. (Note that you can set elements to NULL using x[i] <- list(NULL); see the sketch after this list.)
  • In S, the functions named .First and .Last in the +.Data directory can be used for customizing, as they are executed +at the very beginning and end of a session, respectively. + +

    In R, the startup mechanism is as follows. Unless --no-environ +was given on the command line, R searches for site and user files to +process for setting environment variables. Then, R searches for a +site-wide startup profile unless the command line option +--no-site-file was given. This code is loaded in package +base. Then, unless --no-init-file was given, R +searches for a user profile file, and sources it into the user +workspace. It then loads a saved image of the user workspace from +.RData in case there is one (unless --no-restore-data or +--no-restore were specified). Next, a function .First() +is run if found on the search path. Finally, function .First.sys +in the base package is run. When terminating an R session, by +default a function .Last is run if found on the search path, +followed by .Last.sys. If needed, the functions .First() +and .Last() should be defined in the appropriate startup +profiles. See the help pages for .First and .Last for +more details. +

    +
  • In R, T and F are just variables being set to TRUE +and FALSE, respectively, but are not reserved words as in S and +hence can be overwritten by the user. (This helps e.g. when you have +factors with levels "T" or "F".) Hence, when writing code +you should always use TRUE and FALSE. + +
  • In R, dyn.load() can only load shared objects, as created +for example by R CMD SHLIB. + +
  • In R, attach() currently only works for lists and data frames, +but not for directories. (In fact, attach() also works for R +data files created with save(), which is analogous to attaching +directories in S.) Also, you cannot attach at position 1. + +
  • Categories do not exist in R, and never will as they are deprecated now +in S. Use factors instead. + +
  • In R, For() loops are not necessary and hence not supported. + +
  • In R, assign() uses the argument envir= rather than +where= as in S. + +
  • The random number generators are different, and the seeds have different +length. + +
  • R passes integer objects to C as int * rather than long * +as in S. + +
  • R has no single precision storage mode. However, as of version 0.65.1, +there is a single precision interface to C/FORTRAN subroutines. + +
  • By default, ls() returns the names of the objects in the current +(under R) and global (under S) environment, respectively. For example, +given + +
    +
    x <- 1; fun <- function() {y <- 1; ls()}
    +
    + +

    then fun() returns "y" in R and "x" (together with +the rest of the global environment) in S. +

    +
  • R allows for zero-extent matrices (and arrays, i.e., some elements of +the dim attribute vector can be 0). This has been determined a +useful feature as it helps reducing the need for special-case tests for +empty subsets. For example, if x is a matrix, x[, FALSE] +is not NULL but a “matrix” with 0 columns. Hence, such objects +need to be tested for by checking whether their length() is zero +(which works in both R and S), and not using is.null(). + +
  • Named vectors are considered vectors in R but not in S (e.g., +is.vector(c(a = 1:3)) returns FALSE in S and TRUE +in R). + +
  • Data frames are not considered as matrices in R (i.e., if DF is a +data frame, then is.matrix(DF) returns FALSE in R and +TRUE in S). + +
  • R by default uses treatment contrasts in the unordered case, whereas S +uses the Helmert ones. This is a deliberate difference reflecting the +opinion that treatment contrasts are more natural. + +
  • In R, the argument of a replacement function which corresponds to the +right hand side must be named ‘value’. E.g., f(a) <- b is +evaluated as a <- "f<-"(a, value = b). S always takes the last +argument, irrespective of its name. + +
  • In S, substitute() searches for names for substitution in the +given expression in three places: the actual and the default arguments +of the matching call, and the local frame (in that order). R looks in +the local frame only, with the special rule to use a “promise” if a +variable is not evaluated. Since the local frame is initialized with +the actual arguments or the default expressions, this is usually +equivalent to S, until assignment takes place. + +
  • In S, the index variable in a for() loop is local to the inside +of the loop. In R it is local to the environment where the for() +statement is executed. + +
  • In S, tapply(simplify=TRUE) returns a vector where R returns a +one-dimensional array (which can have named dimnames). + +
  • In S(-PLUS) the C locale is used, whereas in R the current +operating system locale is used for determining which characters are +alphanumeric and how they are sorted. This affects the set of valid +names for R objects (for example accented chars may be allowed in R) and +ordering in sorts and comparisons (such as whether "aA" < "Bb" is +true or false). From version 1.2.0 the locale can be (re-)set in R by +the Sys.setlocale() function. + +
  • In S, missing(arg) remains TRUE if arg is +subsequently modified; in R it doesn’t. + +
  • From R version 1.3.0, data.frame strips I() when creating +(column) names. + +
  • In R, the string "NA" is not treated as a missing value in a +character variable. Use as.character(NA) to create a missing +character value. + +
  • R disallows repeated formal arguments in function calls. + +
  • In S, dump(), dput() and deparse() are essentially +different interfaces to the same code. In R from version 2.0.0, this is +only true if the same control argument is used, but by default it +is not. By default dump() tries to write code that will evaluate +to reproduce the object, whereas dput() and deparse() +default to options for producing deparsed code that is readable. + +
  • In R, indexing a vector, matrix, array or data frame with [ using +a character vector index looks only for exact matches (whereas [[ +and $ allow partial matches). In S, [ allows partial +matches. + +
  • S has a two-argument version of atan and no atan2. A call +in S such as atan(x1, x2) is equivalent to R’s atan2(x1, +x2). However, beware of named arguments since S’s atan(x = a, y += b) is equivalent to R’s atan2(y = a, x = b) with the meanings +of x and y interchanged. (R used to have undocumented +support for a two-argument atan with positional arguments, but +this has been withdrawn to avoid further confusion.) + +
  • Numeric constants with no fractional and exponent (i.e., only integer) +part are taken as integer in S-PLUS 6.x or later, but as double in R. + +
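A small sketch illustrating a few of the differences listed above (the NULL assignment, T/F, and zero-extent points); all object names are made up:

x <- list(a = 1, b = 2, c = 3)
x["a"] <- NULL                  ## removes the element in R (a no-op in S)
x["b"] <- list(NULL)            ## sets the element to NULL without removing it
T <- "not true any more"        ## T is an ordinary variable in R and can be masked,
flags <- c(TRUE, FALSE)         ## so always spell out TRUE and FALSE in code
m <- matrix(1, 2, 3)[, FALSE]   ## a 2 x 0 matrix rather than NULL
length(m) == 0                  ## test emptiness with length(), not is.null()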
+ +

There are also differences which are not intentional, and result from +missing or incorrect code in R. The developers would appreciate hearing +about any deficiencies you may find (in a written report fully +documenting the difference as you see it). Of course, it would be +useful if you were to implement the change yourself and make sure it +works. +

+
+ + + +

3.4 Is there anything R can do that S-PLUS cannot?

+ +

Since almost anything you can do in R has source code that you could +port to S-PLUS with little effort there will never be much you can do +in R that you couldn’t do in S-PLUS if you wanted to. (Note that +using lexical scoping may simplify matters considerably, though.) +

+

R offers several graphics features that S-PLUS does not, such as finer +handling of line types, more convenient color handling (via palettes), +gamma correction for color, and, most importantly, mathematical +annotation in plot texts, via input expressions reminiscent of TeX +constructs. See the help page for plotmath, which features an +impressive on-line example. More details can be found in Paul Murrell +and Ross Ihaka (2000), “An Approach to Providing Mathematical +Annotation in Plots”, Journal of Computational and Graphical Statistics, 9, +582–599. +
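For instance (a minimal sketch; see the plotmath help page and demo(plotmath) for the full range of constructs):

plot(1:10, main = expression(bar(x) == sum(x[i], i == 1, n) / n))
text(4, 8, expression(hat(beta) == (X^T * X)^{-1} * X^T * y))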

+
+ + + +

3.5 What is R-plus?

+ +

For a very long time, there was no such thing. +

+

XLSolutions Corporation is +currently beta testing a commercially supported version of R named R+ +(read R plus). +

+

Revolution Analytics has +released +REvolution R, an enterprise-class statistical analysis system based on +R, suitable for deployment in professional, commercial and regulated +environments. +

+ +

See also +https://en.wikipedia.org/wiki/R_programming_language#Commercialized_versions_of_R +for pointers to commercialized versions of R. +

+
+ +
+

+Next: , Previous: , Up: Top   [Contents]

+
+ +

4 R Web Interfaces

+ +

Rweb is developed and maintained by +Jeff Banfield. The +Rweb Home Page provides access +to all three versions of Rweb—a simple text entry form that returns +output and graphs, a more sophisticated JavaScript version that provides +a multiple window environment, and a set of point and click modules that +are useful for introductory statistics courses and require no knowledge +of the R language. All of the Rweb versions can analyze Web accessible +datasets if a URL is provided. +

+

The paper “Rweb: Web-based Statistical Analysis”, providing a detailed +explanation of the different versions of Rweb and an overview of how +Rweb works, was published in the Journal of Statistical Software +(http://www.jstatsoft.org/v04/i01/). +

+

Ulf Bartel has developed +R-Online, a simple on-line programming environment for R which +intends to make the first steps in statistical programming with R +(especially with time series) as easy as possible. There is no need for +a local installation since the only requirement for the user is a +JavaScript capable browser. See http://www.osvisions.com/r_online/ +for more information. +

+

Rcgi is a CGI WWW interface to R by MJ Ray. It had the ability to use “embedded code”: you could mix +user input and code, allowing the HTML author to do anything from +load in data sets to enter most of the commands for users without +writing CGI scripts. Graphical output was possible in PostScript or GIF +formats and the executed code was presented to the user for revision. +However, it is not clear if the project is still active. +

+

There are many additional examples of web interfaces to R which basically allow submitting R code to a remote server; see for example the collection of links available from http://biostat.mc.vanderbilt.edu/twiki/bin/view/Main/StatCompCourse.

+

David Firth has written +CGIwithR, an R add-on package available from CRAN. It +provides some simple extensions to R to facilitate running R scripts +through the CGI interface to a web server, and allows submission of data +using both GET and POST methods. It is easily installed using Apache +under Linux and in principle should run on any platform that supports R +and a web server provided that the installer has the necessary security +permissions. David’s paper “CGIwithR: Facilities for Processing Web +Forms Using R” was published in the Journal of Statistical Software +(http://www.jstatsoft.org/v08/i10/). The package is now +maintained by Duncan Temple Lang and +has a web page at http://www.omegahat.org/CGIwithR/. +

+ +

Jeff Horner is working on the R/Apache Integration Project which embeds +the R interpreter inside Apache 2 (and beyond). A tutorial and +presentation are available from the project web page at +http://biostat.mc.vanderbilt.edu/twiki/bin/view/Main/RApacheProject. +

+

Rserve is a project +actively developed by Simon Urbanek. It implements a TCP/IP server which +allows other programs to use facilities of R. Clients are available from +the web site for Java and C++ (and could be written for other languages +that support TCP/IP sockets). +

+ +

Two projects use PHP to provide a web interface to R. +R_PHP_Online by Steve Chen (though +it is unclear if this project is still active) is somewhat similar to +the above Rcgi and Rweb. R-php is actively developed by Alfredo Pontillo and Angelo Mineo and +provides both a web interface to R and a set of pre-specified analyses +that need no R code input. +

+

webbioc is “an integrated web +interface for doing microarray analysis using several of the +Bioconductor packages” and is designed to be installed at local sites +as a shared computing resource. +

+

Rwui is a web application to +create user-friendly web interfaces for R scripts. All code for the web +interface is created automatically. There is no need for the user to do +any extra scripting or learn any new scripting techniques. +

+

The R.rsp package by Henrik Bengtsson introduces “R Server +Pages”. Analogous to Java Server Pages, an R server page is typically +HTML with embedded R code that gets evaluated when the page is +requested. The package includes an internal cross-platform +HTTP server implemented in Tcl, so provides a good framework +for including web-based user interfaces in packages. The approach is +similar to the use of the brew package with +Rapache with the advantage of cross-platform +support and easy installation. +

+

The Rook package by Jeffrey Horner provides a web server +interface borrowing heavily from Ruby’s Rack project. +

+

Finally, Concerto is +a user friendly open-source Web Interface to R developed at the +Psychometrics Centre of Cambridge University. It was designed as an +online platform to design and run Computerized Adaptive Tests, but can +be also used as a general-purpose R Web Interface. It allows R users +with no programming or web designing background to quickly develop +flexible and powerful online applications, websites, and psychometrics +tests. To maximize its reliability, security, and performance, Concerto +relies on the popular and reliable open-source elements such as MySQL +server (exchange and storage of the data), Rstudio (R code designing and testing, file management), CKEditor +(HTML Layer design), and PHP. +

+ + +
+ +
+

+Next: , Previous: , Up: Top   [Contents]

+
+ +

5 R Add-On Packages

+ + + + + + + + + + +
+ + + +

5.1 Which add-on packages exist for R?

+ + + + + + + + + +
+ + + +

5.1.1 Add-on packages in R

+ +

The R distribution comes with the following packages: +

+
+
base
+

Base R functions (and datasets before R 2.0.0). +

+
compiler
+

R byte code compiler (added in R 2.13.0). +

+
datasets
+

Base R datasets (added in R 2.0.0). +

+
grDevices
+

Graphics devices for base and grid graphics (added in R 2.0.0). +

+
graphics
+

R functions for base graphics. +

+
grid
+

A rewrite of the graphics layout capabilities, plus some support for +interaction. +

+
methods
+

Formally defined methods and classes for R objects, plus other +programming tools, as described in the Green Book. +

+
parallel
+

Support for parallel computation, including by forking and by sockets, +and random-number generation (added in R 2.14.0). +

+
splines
+

Regression spline functions and classes. +

+
stats
+

R statistical functions. +

+
stats4
+

Statistical functions using S4 classes. +

+
tcltk
+

Interface and language bindings to Tcl/Tk GUI elements. +

+
tools
+

Tools for package development and administration. +

+
utils
+

R utility functions. +

+
+

These “base packages” were substantially reorganized in R 1.9.0. The +former base was split into the four packages base, +graphics, stats, and utils. Packages ctest, +eda, modreg, mva, nls, stepfun and +ts were merged into stats, package lqs returned to the +recommended package MASS, and package mle moved to +stats4. +

+
+ + + +

5.1.2 Add-on packages from CRAN

+ +

The CRAN src/contrib area contains a wealth of add-on +packages, including the following recommended packages which are +to be included in all binary distributions of R. +

+
+
KernSmooth
+

Functions for kernel smoothing (and density estimation) corresponding to +the book “Kernel Smoothing” by M. P. Wand and M. C. Jones, 1995. +

+
MASS
+

Functions and datasets from the main package of Venables and Ripley, +“Modern Applied Statistics with S”. +(Contained in the VR bundle for R versions prior to 2.10.0.) +

+
Matrix
+

A Matrix package. +(Recommended for R 2.9.0 or later.) +

+
boot
+

Functions and datasets for bootstrapping from the book “Bootstrap +Methods and Their Applications” by A. C. Davison and D. V. Hinkley, +1997, Cambridge University Press. +

+
class
+

Functions for classification (k-nearest neighbor and LVQ). +(Contained in the VR bundle for R versions prior to 2.10.0.) +

+
cluster
+

Functions for cluster analysis. +

+
codetools
+

Code analysis tools. +(Recommended for R 2.5.0 or later.) +

+
foreign
+

Functions for reading and writing data stored by statistical software +like Minitab, S, SAS, SPSS, Stata, Systat, etc. +

+
lattice
+

Lattice graphics, an implementation of Trellis Graphics functions. +

+
mgcv
+

Routines for GAMs and other generalized ridge regression problems with +multiple smoothing parameter selection by GCV or UBRE. +

+
nlme
+

Fit and compare Gaussian linear and nonlinear mixed-effects models. +

+
nnet
+

Software for single hidden layer perceptrons (“feed-forward neural +networks”), and for multinomial log-linear models. +(Contained in the VR bundle for R versions prior to 2.10.0.) +

+
rpart
+

Recursive PARTitioning and regression trees. +

+
spatial
+

Functions for kriging and point pattern analysis from “Modern Applied +Statistics with S” by W. Venables and B. Ripley. +(Contained in the VR bundle for R versions prior to 2.10.0.) +

+
survival
+

Functions for survival analysis, including penalized likelihood. +

+
+

See the CRAN +contributed packages page for more information. +

+

Many of these packages are categorized into +CRAN Task Views, which allow +browsing packages by topic and provide tools to automatically install +all packages for special areas of interest. +

+

Some CRAN packages that do not build out of the box on Windows, +require additional software, or ship third-party libraries for +Windows cannot be made available on CRAN in the form of Windows binary +packages. Nevertheless, some of these packages are available at the +“CRAN extras” repository at +https://www.stats.ox.ac.uk/pub/RWin/, kindly provided by Brian +D. Ripley. Note that this repository is a default repository for recent +versions of R for Windows. +

+
+ + + +

5.1.3 Add-on packages from Omegahat

+ +

The Omega Project for Statistical +Computing provides a variety of open-source software for statistical +applications, with special emphasis on web-based software, Java, the +Java virtual machine, and distributed computing. A CRAN style +R package repository is available via http://www.omegahat.org/R/. +See http://www.omegahat.org/ for information on most R packages +available from the Omega project. +

+
+ + + +

5.1.4 Add-on packages from Bioconductor

+ +

Bioconductor is an open source and +open development software project for the analysis and comprehension of +genomic data. Most Bioconductor components are distributed as R add-on +packages. Initially most of the +Bioconductor software packages +focused primarily on DNA microarray data analysis. As the +project has matured, the functional scope of the software packages +broadened to include the analysis of all types of genomic data, such as +SAGE, sequence, or SNP data. In addition, there are metadata +(annotation, CDF and probe) and experiment data packages. See +https://www.bioconductor.org/download/ for available packages and a +complete taxonomy via BioC Views. +

+
+ + + +

5.1.5 Other add-on packages

+ +

Many more packages are available from places other than the three +default repositories discussed above (CRAN, Bioconductor and +Omegahat). In particular, R-Forge provides a CRAN style repository +at https://R-Forge.R-project.org/. +

+

More code has been posted to the R-help mailing list, and can be +obtained from the mailing list archive. +

+
+ + + +

5.2 How can add-on packages be installed?

+ +

(Unix-like only.) The add-on packages on CRAN come as gzipped tar +files named pkg_version.tar.gz, which may in fact be +“bundles” containing more than one package. Let path be the +path to such a package file. Provided that tar and +gzip are available on your system, type +

+
+
$ R CMD INSTALL path/pkg_version.tar.gz
+
+ +

at the shell prompt to install to the library tree rooted at the first +directory in your library search path (see the help page for +.libPaths() for details on how the search path is determined). +

+

To install to another tree (e.g., your private one), use +

+
+
$ R CMD INSTALL -l lib path/pkg_version.tar.gz
+
+ +

where lib gives the path to the library tree to install to. +

+

Even more conveniently, you can install and automatically update +packages from within R if you have access to repositories such as +CRAN. See the help page for available.packages() for more +information. +


5.3 How can add-on packages be used?

+ +

To find out which additional packages are available on your system, type +

+
+
library()
+
+ +

at the R prompt. +

+

This produces something like +

+
+
+
+
Packages in `/home/me/lib/R':
+
+mystuff       My own R functions, nicely packaged but not documented
+
+Packages in `/usr/local/lib/R/library':
+
+KernSmooth    Functions for kernel smoothing for Wand & Jones (1995)
+MASS          Main Package of Venables and Ripley's MASS
+Matrix        Sparse and Dense Matrix Classes and Methods
+base          The R Base package
+boot          Bootstrap R (S-Plus) Functions (Canty)
+class         Functions for Classification
+cluster       Functions for clustering (by Rousseeuw et al.)
+codetools     Code Analysis Tools for R
+datasets      The R Datasets Package
+foreign       Read Data Stored by Minitab, S, SAS, SPSS, Stata, Systat,
+              dBase, ...
+grDevices     The R Graphics Devices and Support for Colours and Fonts
+graphics      The R Graphics Package
+grid          The Grid Graphics Package
+lattice       Lattice Graphics
+methods       Formal Methods and Classes
+mgcv          GAMs with GCV/AIC/REML smoothness estimation and GAMMs
+              by PQL
+nlme          Linear and Nonlinear Mixed Effects Models
+nnet          Feed-forward Neural Networks and Multinomial Log-Linear
+              Models
+rpart         Recursive Partitioning
+spatial       Functions for Kriging and Point Pattern Analysis
+splines       Regression Spline Functions and Classes
+stats         The R Stats Package
+stats4        Statistical functions using S4 Classes
+survival      Survival analysis, including penalised likelihood
+tcltk         Tcl/Tk Interface
+tools         Tools for Package Development
+utils         The R Utils Package
+
+
+
+ +

You can “load” the installed package pkg by +

+
+
library(pkg)
+
+ +

You can then find out which functions it provides by typing one of +

+
+
library(help = pkg)
+help(package = pkg)
+
+ +

You can unload the loaded package pkg by +

+
+
detach("package:pkg", unload = TRUE)
+
+ +

(where unload = TRUE is needed only for packages with a +namespace, see ?unload). +

+
+ + + +

5.4 How can add-on packages be removed?

+ +

Use +

+
+
$ R CMD REMOVE pkg_1pkg_n
+
+ +

to remove the packages pkg_1, …, pkg_n from the +library tree rooted at the first directory given in R_LIBS if this +is set and non-null, and from the default library otherwise. (Versions +of R prior to 1.3.0 removed from the default library by default.) +

+

To remove from library lib, do +

+
+
$ R CMD REMOVE -l lib pkg_1pkg_n
+
+ +
+ + + +

5.5 How can I create an R package?

+ +

A package consists of a subdirectory containing a file +DESCRIPTION and the subdirectories R, data, +demo, exec, inst, man, po, +src, and tests (some of which can be missing). The +package subdirectory may also contain files INDEX, +NAMESPACE, configure, cleanup, LICENSE, +LICENCE, COPYING and NEWS. +

+

See section “Creating R packages” in Writing R Extensions, for +details. This manual is included in the R distribution, see What documentation exists for R?, and gives information on package +structure, the configure and cleanup mechanisms, and on automated +package checking and building. +

+

R version 1.3.0 has added the function package.skeleton() which +will set up directories, save data and code, and create skeleton help +files for a set of R functions and datasets. +
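For example, a minimal sketch (the function and package names below are made up purely for illustration):
myfun <- function(x) x + 1
package.skeleton(name = "mypkg", list = "myfun")
## creates ./mypkg/ with DESCRIPTION, R/ and man/ skeletons ready to be edited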

+

See What is CRAN?, for information on uploading a package to CRAN. +

+
+ + + +

5.6 How can I contribute to R?

+ +

R is in active development and there is always a risk of bugs creeping +in. Also, the developers do not have access to all possible machines +capable of running R. So, simply using it and communicating problems is +certainly of great value. +

+

The R Developer Page acts as an +intermediate repository for more or less finalized ideas and plans for +the R statistical system. It contains (pointers to) TODO lists, RFCs, +various other writeups, ideas lists, and SVN miscellanea. +

+
+ +
+


+
+ +

6 R and Emacs


6.1 Is there Emacs support for R?

+ +

There is an Emacs package called ESS (“Emacs Speaks +Statistics”) which provides a standard interface between statistical +programs and statistical processes. It is intended to provide +assistance for interactive statistical programming and data analysis. +Languages supported include: S dialects (R, S 3/4, and S-PLUS +3.x/4.x/5.x/6.x/7.x), LispStat dialects (XLispStat, ViSta), SAS, Stata, +and BUGS. +

+

ESS grew out of the need for bug fixes and extensions to +S-mode 4.8 (which was a GNU Emacs interface to S/S-PLUS +version 3 only). The current set of developers desired support for +XEmacs, R, S4, and MS Windows. In addition, with new modes being +developed for R, Stata, and SAS, it was felt that a unifying interface +and framework for the user interface would benefit both the user and the +developer, by helping both groups conform to standard Emacs usage. The +end result is an increase in efficiency for statistical programming and +data analysis, over the usual tools. +

+

R support contains code for editing R source code (syntactic indentation +and highlighting of source code, partial evaluations of code, loading +and error-checking of code, and source code revision maintenance) and +documentation (syntactic indentation and highlighting of source code, +sending examples to running ESS process, and previewing), +interacting with an inferior R process from within Emacs (command-line +editing, searchable command history, command-line completion of R object +and file names, quick access to object and search lists, transcript +recording, and an interface to the help system), and transcript +manipulation (recording and saving transcript files, manipulating and +editing saved transcripts, and re-evaluating commands from transcript +files). +

+

The latest stable version of ESS is available via CRAN or +the ESS web page. The HTML version +of the documentation can be found at https://stat.ethz.ch/ESS/. +

+

ESS comes with detailed installation instructions. +

+

For help with ESS, send email to +ESS-help@stat.math.ethz.ch. +

+

Please send bug reports and suggestions on ESS to +ESS-bugs@stat.math.ethz.ch. The easiest way to do this is from +within Emacs, by typing M-x ess-submit-bug-report or using the +[ESS] or [iESS] pulldown menus. +

+
+ + + +

6.2 Should I run R from within Emacs?

+ +

Yes, definitely. Inferior R mode provides a readline/history +mechanism, object name completion, and syntax-based highlighting of the +interaction buffer using Font Lock mode, as well as a very convenient +interface to the R help system. +

+

Of course, it also integrates nicely with the mechanisms for editing R +source using Emacs. One can write code in one Emacs buffer and send +whole or parts of it for execution to R; this is helpful for both data +analysis and programming. One can also seamlessly integrate with a +revision control system, in order to maintain a log of changes in your +programs and data, as well as to allow for the retrieval of past +versions of the code. +

+

In addition, it allows you to keep a record of your session, which can +also be used for error recovery through the use of the transcript mode. +

+

To specify command line arguments for the inferior R process, use +C-u M-x R for starting R. +

+ +
+ + + +

6.3 Debugging R from within Emacs

+ +

To debug R “from within Emacs”, there are several possibilities. To +use the Emacs GUD (Grand Unified Debugger) library with the recommended +debugger GDB, type M-x gdb and give the path to the R +binary as argument. At the gdb prompt, set +R_HOME and other environment variables as needed (using e.g. +set env R_HOME /path/to/R/, but see also below), and start the +binary with the desired arguments (e.g., run --quiet). +

+

If you have ESS, you can do C-u M-x R RET - d +SPC g d b RET to start an inferior R process with arguments +-d gdb. +

+

A third option is to start an inferior R process via ESS +(M-x R) and then start GUD (M-x gdb) giving the R binary +(using its full path name) as the program to debug. Use the program +ps to find the process number of the currently running R +process then use the attach command in gdb to attach it to that +process. One advantage of this method is that you have separate +*R* and *gud-gdb* windows. Within the *R* window +you have all the ESS facilities, such as object-name +completion, that we know and love. +

+

When using GUD mode for debugging from within Emacs, you may find it +most convenient to use the directory with your code in it as the current +working directory and then make a symbolic link from that directory to +the R binary. That way .gdbinit can stay in the directory with +the code and be used to set up the environment and the search paths for +the source, e.g. as follows: +

+
+
set env R_HOME /opt/R
+set env R_PAPERSIZE letter
+set env R_PRINTCMD lpr
+dir /opt/R/src/appl
+dir /opt/R/src/main
+dir /opt/R/src/nmath
+dir /opt/R/src/unix
+
+ +
+ +
+


+
+ +

7 R Miscellanea


7.1 How can I set components of a list to NULL?

+ +

You can use +

+
+
x[i] <- list(NULL)
+
+ +

to set component i of the list x to NULL, similarly +for named components. Do not set x[i] or x[[i]] to +NULL, because this will remove the corresponding component from +the list. +
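A small illustration of the difference (the list and its values are arbitrary):
x <- list(a = 1, b = 2, c = 3)
x["b"] <- list(NULL)   # component 'b' is kept, with value NULL
length(x)              # still 3
x[["c"]] <- NULL       # component 'c' is removed from the list
length(x)              # now 2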

+

For dropping the row names of a matrix x, it may be easier to use +rownames(x) <- NULL, similarly for column names. +

+
+ + + +

7.2 How can I save my workspace?

+ +

save.image() saves the objects in the user’s .GlobalEnv to +the file .RData in the R startup directory. (This is also what +happens after q("yes").) Using save.image(file) one +can save the image under a different name. +
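For example (the file name below is arbitrary):
save.image()                           # writes .RData in the current directory
save.image(file = "myanalysis.RData")  # saves the workspace under another name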

+
+ + + +

7.3 How can I clean up my workspace?

+ +

To remove all objects in the currently active environment (typically +.GlobalEnv), you can do +

+
+
rm(list = ls(all = TRUE))
+
+ +

(Without all = TRUE, only the objects with names not starting +with a ‘.’ are removed.) +

+
+ + + +

7.4 How can I get eval() and D() to work?

+ +

Strange things will happen if you use eval(print(x), envir = e) +or D(x^2, "x"). The first one will either tell you that +"x" is not found, or print the value of the wrong x. +The other one will likely return zero if x exists, and an error +otherwise. +

+

This is because in both cases, the first argument is evaluated in the +calling environment first. The result (which should be an object of +mode "expression" or "call") is then evaluated or +differentiated. What you (most likely) really want is obtained by +“quoting” the first argument upon surrounding it with +expression(). For example, +

+
+
R> D(expression(x^2), "x")
+2 * x
+
+ +

Although this behavior may initially seem to be rather strange, it is +perfectly logical. The “intuitive” behavior could easily be +implemented, but problems would arise whenever the expression is +contained in a variable, passed as a parameter, or is the result of a +function call. Consider for instance the semantics in cases like +

+
+
D2 <- function(e, n) D(D(e, n), n)
+
+ +

or +

+
+
g <- function(y) eval(substitute(y), sys.frame(sys.parent(n = 2)))
+g(a * b)
+
+ +

See the help page for deriv() for more examples. +

+
+ + + +

7.5 Why do my matrices lose dimensions?

+ +

When a matrix with a single row or column is created by a subscripting +operation, e.g., row <- mat[2, ], it is by default turned into a +vector. In a similar way if an array with dimension, say, 2 x 3 x 1 x 4 is created by subscripting it will be coerced into a 2 x 3 x 4 +array, losing the unnecessary dimension. After much discussion this has +been determined to be a feature. +

+

To prevent this happening, add the option drop = FALSE to the +subscripting. For example, +

+
+
rowmatrix <- mat[2, , drop = FALSE]  # creates a row matrix
+colmatrix <- mat[, 2, drop = FALSE]  # creates a column matrix
+a <- b[1, 1, 1, drop = FALSE]        # creates a 1 x 1 x 1 array
+
+ +

The drop = FALSE option should be used defensively when +programming. For example, the statement +

+
+
somerows <- mat[index, ]
+
+ +

will return a vector rather than a matrix if index happens to +have length 1, causing errors later in the code. It should probably be +rewritten as +

+
+
somerows <- mat[index, , drop = FALSE]
+
+ +
+ + + +

7.6 How does autoloading work?

+ +

R has a special environment called .AutoloadEnv. Using +autoload(name, pkg), where name and +pkg are strings giving the names of an object and the package +containing it, stores some information in this environment. When R +tries to evaluate name, it loads the corresponding package +pkg and reevaluates name in the new package’s +environment. +

+

Using this mechanism makes R behave as if the package was loaded, but +does not occupy memory (yet). +
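A small sketch, using the recommended MASS package purely as an example:
autoload("lda", "MASS")  # register 'lda' so that MASS is loaded on first use
## A subsequent call such as lda(Species ~ ., data = iris) would then
## load MASS automatically before the expression is evaluated.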

+

See the help page for autoload() for a very nice example. +

+
+ + + +

7.7 How should I set options?

+ +

The function options() allows setting and examining a variety of +global “options” which affect the way in which R computes and displays +its results. The variable .Options holds the current values of +these options, but should never directly be assigned to unless you want +to drive yourself crazy—simply pretend that it is a “read-only” +variable. +

+

For example, given +

+
+
test1 <- function(x = pi, dig = 3) {
+  oo <- options(digits = dig); on.exit(options(oo));
+  cat(.Options$digits, x, "\n")
+}
+test2 <- function(x = pi, dig = 3) {
+  .Options$digits <- dig
+  cat(.Options$digits, x, "\n")
+}
+
+ +

we obtain: +

+
+
R> test1()
+3 3.14 
+R> test2()
+3 3.141593
+
+ +

What is really used is the global value of .Options, and +using options(OPT = VAL) correctly updates it. Local copies of +.Options, either in .GlobalEnv or in a function +environment (frame), are just silently disregarded. +

+
+ + + +

7.8 How do file names work in Windows?

+ +

As R uses C-style string handling, ‘\’ is treated as an escape +character, so that for example one can enter a newline as ‘\n’. +When you really need a ‘\’, you have to escape it with another +‘\’. +

+

Thus, in filenames use something like "c:\\data\\money.dat". You +can also replace ‘\’ by ‘/’ ("c:/data/money.dat"). +

+
+ + + +

7.9 Why does plotting give a color allocation error?

+ +

On an X11 device, plotting sometimes, e.g., when running +demo("image"), results in “Error: color allocation error”. +This is an X problem, and only indirectly related to R. It occurs when +applications started prior to R have used all the available colors. +(How many colors are available depends on the X configuration; sometimes +only 256 colors can be used.) +

+

One application which is notorious for “eating” colors is Netscape. +If the problem occurs when Netscape is running, try (re)starting it with +either the -no-install (to use the default colormap) or the +-install (to install a private colormap) option. +

+

You could also set the colortype of X11() to +"pseudo.cube" rather than the default "pseudo". See the +help page for X11() for more information. +

+ + + +
+ + + +

7.10 How do I convert factors to numeric?

+ +

It may happen that when reading numeric data into R (usually, when +reading in a file), they come in as factors. If f is such a +factor object, you can use +

+
+
as.numeric(as.character(f))
+
+ +

to get the numbers back. More efficient, but harder to remember, is +

+
+
as.numeric(levels(f))[as.integer(f)]
+
+ +

In any case, do not call as.numeric() or their likes directly for +the task at hand (as as.numeric() or unclass() give the +internal codes). +

+
+ + + +

7.11 Are Trellis displays implemented in R?

+ +

The recommended package lattice (which is based on base +package grid) provides graphical functionality that is compatible +with most Trellis commands. +

+

You could also look at coplot() and dotchart() which might +do at least some of what you want. Note also that the R version of +pairs() is fairly general and provides most of the functionality +of splom(), and that R’s default plot method has an argument +asp allowing you to specify (and fix against device resizing) the +aspect ratio of the plot. +

+

(Because the word “Trellis” has been claimed as a trademark we do not +use it in R. The name “lattice” has been chosen for the R +equivalent.) +

+
+ + + +

7.12 What are the enclosing and parent environments?

+ +

Inside a function you may want to access variables in two additional +environments: the one that the function was defined in (“enclosing”), +and the one it was invoked in (“parent”). +

+

If you create a function at the command line or load it in a package its +enclosing environment is the global workspace. If you define a function +f() inside another function g() its enclosing environment +is the environment inside g(). The enclosing environment for a +function is fixed when the function is created. You can find out the +enclosing environment for a function f() using +environment(f). +

+

The “parent” environment, on the other hand, is defined when you +invoke a function. If you invoke lm() at the command line its +parent environment is the global workspace, if you invoke it inside a +function f() then its parent environment is the environment +inside f(). You can find out the parent environment for an +invocation of a function by using parent.frame() or +sys.frame(sys.parent()). +

+

So for most user-visible functions the enclosing environment will be the +global workspace, since that is where most functions are defined. The +parent environment will be wherever the function happens to be called +from. If a function f() is defined inside another function +g() it will probably be used inside g() as well, so its +parent environment and enclosing environment will probably be the same. +

+

Parent environments are important because things like model formulas +need to be evaluated in the environment the function was called from, +since that’s where all the variables will be available. This relies on +the parent environment being potentially different with each invocation. +

+

Enclosing environments are important because a function can use +variables in the enclosing environment to share information with other +functions or with other invocations of itself (see the section on +lexical scoping). This relies on the enclosing environment being the +same each time the function is invoked. (In C this would be done with +static variables.) +

+

Scoping is hard. Looking at examples helps. It is particularly +instructive to look at examples that work differently in R and S and try +to see why they differ. One way to describe the scoping differences +between R and S is to say that in S the enclosing environment is +always the global workspace, but in R the enclosing environment +is wherever the function was created. +

+
+ + + +

7.13 How can I substitute into a plot label?

+ +

Often, it is desired to use the value of an R object in a plot label, +e.g., a title. This is easily accomplished using paste() if the +label is a simple character string, but not always obvious in case the +label is an expression (for refined mathematical annotation). In such a +case, either use parse() on your pasted character string or use +substitute() on an expression. For example, if ahat is an +estimator of your parameter a of interest, use +

+
+
title(substitute(hat(a) == ahat, list(ahat = ahat)))
+
+ +

(note that it is ‘==’ and not ‘=’). Sometimes bquote() +gives a more compact form, e.g., +

+
+
title(bquote(hat(a) == .(ahat)))
+
+ +

where subexpressions enclosed in ‘.()’ are replaced by their +values. +

+

There are more examples in the mailing list archives. +

+
+ + + +

7.14 What are valid names?

+ +

When creating data frames using data.frame() or +read.table(), R by default ensures that the variable names are +syntactically valid. (The argument check.names to these +functions controls whether variable names are checked and adjusted by +make.names() if needed.) +

+

To understand what names are “valid”, one needs to take into account +that the term “name” is used in several different (but related) ways +in the language: +

+
    +
  1. A syntactic name is a string the parser interprets as this type +of expression. It consists of letters, numbers, and the dot and (for +versions of R at least 1.9.0) underscore characters, and starts with +either a letter or a dot not followed by a number. Reserved words are +not syntactic names. +
  2. An object name is a string associated with an object that is +assigned in an expression either by having the object name on the left +of an assignment operation or as an argument to the assign() +function. It is usually a syntactic name as well, but can be any +non-empty string if it is quoted (and it is always quoted in the call to +assign()). + +
  3. An argument name is what appears to the left of the equals sign +when supplying an argument in a function call (for example, +f(trim=.5)). Argument names are also usually syntactic names, +but again can be anything if they are quoted. + +
  4. An element name is a string that identifies a piece of an object +(a component of a list, for example.) When it is used on the right of +the ‘$’ operator, it must be a syntactic name, or quoted. +Otherwise, element names can be any strings. (When an object is used as +a database, as in a call to eval() or attach(), the +element names become object names.) + +
  5. Finally, a file name is a string identifying a file in the +operating system for reading, writing, etc. It really has nothing much +to do with names in the language, but it is traditional to call these +strings file “names”. +
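As an illustration of how make.names() adjusts non-syntactic names (the inputs are chosen arbitrarily):
make.names(c("mean value", "1st try", "x_1"))
## typically gives: "mean.value" "X1st.try" "x_1"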
+ +
+ + + +

7.15 Are GAMs implemented in R?

+ +

Package gam from CRAN implements all the Generalized +Additive Models (GAM) functionality as described in the GAM chapter of +the White Book. In particular, it implements backfitting with both +local regression and smoothing splines, and is extendable. There is a +gam() function for GAMs in package mgcv, but it is not +an exact clone of what is described in the White Book (no lo() +for example). Package gss can fit spline-based GAMs too. And +if you can accept regression splines you can use glm(). For +Gaussian GAMs you can use bruto() from package mda. +

+
+ + + +

7.16 Why is the output not printed when I source() a file?

+ +

Most R commands do not generate any output. The command +

+
+
1+1
+
+ +

computes the value 2 and returns it; the command +

+
+
summary(glm(y~x+z, family=binomial))
+
+ +

fits a logistic regression model, computes some summary information and +returns an object of class "summary.glm" (see How should I write summary methods?). +

+

If you type ‘1+1’ or ‘summary(glm(y~x+z, family=binomial))’ at +the command line the returned value is automatically printed (unless it +is invisible()), but in other circumstances, such as in a +source()d file or inside a function it isn’t printed unless you +specifically print it. +

+

To print the value use +

+
+
print(1+1)
+
+ +

or +

+
+
print(summary(glm(y~x+z, family=binomial)))
+
+ +

instead, or use source(file, echo=TRUE). +

+
+ + + +

7.17 Why does outer() behave strangely with my function?

+ +

As the help for outer() indicates, it does not work on arbitrary +functions the way the apply() family does. It requires functions +that are vectorized to work elementwise on arrays. As you can see by +looking at the code, outer(x, y, FUN) creates two large vectors +containing every possible combination of elements of x and +y and then passes this to FUN all at once. Your function +probably cannot handle two large vectors as parameters. +

+

If you have a function that cannot handle two vectors but can handle two +scalars, then you can still use outer() but you will need to wrap +your function up first, to simulate vectorized behavior. Suppose your +function is +

+
+
foo <- function(x, y, happy) {
+  stopifnot(length(x) == 1, length(y) == 1) # scalars only!
+  (x + y) * happy
+}
+
+ +

If you define the general function +

+
+
wrapper <- function(x, y, my.fun, ...) {
+  sapply(seq_along(x), FUN = function(i) my.fun(x[i], y[i], ...))
+}
+
+ +

then you can use outer() by writing, e.g., +

+
+
outer(1:4, 1:2, FUN = wrapper, my.fun = foo, happy = 10)
+
+ +

Scalar functions can also be vectorized using Vectorize(). +

+
+ + + +

7.18 Why does the output from anova() depend on the order of factors in the model?

+ +

In a model such as ~A+B+A:B, R will report the difference in sums +of squares between the models ~1, ~A, ~A+B and +~A+B+A:B. If the model were ~B+A+A:B, R would report +differences between ~1, ~B, ~A+B, and +~A+B+A:B . In the first case the sum of squares for A is +comparing ~1 and ~A, in the second case it is comparing +~B and ~B+A. In a non-orthogonal design (i.e., most +unbalanced designs) these comparisons are (conceptually and numerically) +different. +

+

Some packages report instead the sums of squares based on comparing the +full model to the models with each factor removed one at a time (the +famous ‘Type III sums of squares’ from SAS, for example). These do not +depend on the order of factors in the model. The question of which set +of sums of squares is the Right Thing provokes low-level holy wars on +R-help from time to time. +

+

There is no need to be agitated about the particular sums of squares +that R reports. You can compute your favorite sums of squares quite +easily. Any two models can be compared with anova(model1, +model2), and drop1(model1) will show the sums of +squares resulting from dropping single terms. +
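For instance, assuming a data frame dat with response y and factors A and B (names invented for illustration):
fit1 <- lm(y ~ A, data = dat)
fit2 <- lm(y ~ A + B, data = dat)
anova(fit1, fit2)  # compare the two nested models directly
drop1(fit2)        # sums of squares from dropping each term in turn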

+
+ + + +

7.19 How do I produce PNG graphics in batch mode?

+ +

Under a Unix-alike, if your installation supports the +type="cairo" option to the png() device there should be no +problems, and the default settings should just work. This option is not +available for versions of R prior to 2.7.0, or without support for +cairo. From R 2.7.0 png() by default uses the Quartz device +on OS X, and that too works in batch mode. +
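A minimal sketch, assuming cairo support was compiled in (this can be checked with capabilities("cairo")):
png("plot.png", type = "cairo")  # works without an X11 display
plot(rnorm(100))
dev.off()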

+

Earlier versions of the png() device used the X11 driver, which +is a problem in batch mode or for remote operation. If you have +Ghostscript you can use bitmap(), which produces a PostScript or +PDF file then converts it to any bitmap format supported by Ghostscript. +On some installations this produces ugly output, on others it is +perfectly satisfactory. Many systems now come with Xvfb from +X.Org (possibly as an optional +install), which is an X11 server that does not require a screen; and +there is the GDD package from CRAN, which produces PNG, +JPEG and GIF bitmaps without X11. +

+
+ + + +

7.20 How can I get command line editing to work?

+ +

The Unix-like command-line interface to R can only provide the inbuilt +command-line editor, which allows recall, editing and re-submission of +prior commands, provided that the GNU readline library is +available at the time R is configured for compilation. Note that the +‘development’ version of readline, including the appropriate headers, is +needed: users of Linux binary distributions will need to install +packages such as libreadline-dev (Debian) or +readline-devel (Red Hat). +

+
+ + + +

7.21 How can I turn a string into a variable?

+ +

If you have +

+
+
varname <- c("a", "b", "d")
+
+ +

you can do +

+
+
get(varname[1]) + 2
+
+ +

for +

+
+
a + 2
+
+ +

or +

+
+
assign(varname[1], 2 + 2)
+
+ +

for +

+
+
a <- 2 + 2
+
+ +

or +

+
+
eval(substitute(lm(y ~ x + variable),
+                list(variable = as.name(varname[1]))))
+
+ +

for +

+
+
lm(y ~ x + a)
+
+ +

At least in the first two cases it is often easier to just use a list, +and then you can easily index it by name +

+
+
vars <- list(a = 1:10, b = rnorm(100), d = LETTERS)
+vars[["a"]]
+
+ +

without any of this messing about. +

+
+ + + +

7.22 Why do lattice/trellis graphics not work?

+ +

The most likely reason is that you forgot to tell R to display the +graph. Lattice functions such as xyplot() create a graph object, +but do not display it (the same is true of ggplot2 graphics, +and Trellis graphics in S-PLUS). The print() method for the +graph object produces the actual display. When you use these functions +interactively at the command line, the result is automatically printed, +but in source() or inside your own functions you will need an +explicit print() statement. +
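For example (using the built-in cars data set):
library(lattice)
p <- xyplot(dist ~ speed, data = cars)  # creates the graph object only
print(p)                                # explicitly print it to display the plot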

+
+ + + +

7.23 How can I sort the rows of a data frame?

+ +

To sort the rows within a data frame, with respect to the values in one +or more of the columns, simply use order() (e.g., +DF[order(DF$a, DF[["b"]]), ] to sort the data frame DF on +columns named a and b). +
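A small example (the data values are arbitrary):
DF <- data.frame(a = c(2, 1, 2), b = c(30, 20, 10))
DF[order(DF$a, DF$b), ]   # sort ascending by a, then by b
DF[order(-DF$a), ]        # sort descending by a (numeric columns only)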

+
+ + + +

7.24 Why does the help.start() search engine not work?

+ +

The browser-based search engine in help.start() utilizes a Java +applet. In order for this to function properly, a compatible version of +Java must be installed on your system and linked to your browser, and both +Java and JavaScript need to be enabled in your browser. +

+

There have been a number of compatibility issues with versions of Java +and of browsers. +For further details please consult section “Enabling search in HTML +help” in R Installation and Administration. This manual is +included in the R distribution, see What documentation exists for R?, and its HTML version is linked from the HTML +search page. +

+
+ + + +

7.25 Why did my .Rprofile stop working when I updated R?

+ +

Did you read the NEWS file? For functions that are not in the +base package you need to specify the correct package namespace, +since the code will be run before the packages are loaded. E.g., +

+
+
ps.options(horizontal = FALSE)
+help.start()
+
+ +

needs to be +

+
+
grDevices::ps.options(horizontal = FALSE)
+utils::help.start()
+
+ +

(graphics::ps.options(horizontal = FALSE) in R 1.9.x). +

+
+ + + +

7.26 Where have all the methods gone?

+ +

Many functions, particularly S3 methods, are now hidden in namespaces. +This has the advantage that they cannot be called inadvertently with +arguments of the wrong class, but it makes them harder to view. +

+

To see the code for an S3 method (e.g., [.terms) use +

+
+
getS3method("[", "terms")
+
+ +

To see the code for an unexported function foo() in the namespace +of package "bar" use bar:::foo. Don’t use these +constructions to call unexported functions in your own code—they are +probably unexported for a reason and may change without warning. +

+
+ + + +

7.27 How can I create rotated axis labels?

+ +

To rotate axis labels (using base graphics), you need to use +text(), rather than mtext(), as the latter does not +support par("srt"). +

+
+
## Increase bottom margin to make room for rotated labels
+par(mar = c(7, 4, 4, 2) + 0.1)
+## Create plot with no x axis and no x axis label
+plot(1 : 8, xaxt = "n",  xlab = "")
+## Set up x axis with tick marks alone
+axis(1, labels = FALSE)
+## Create some text labels
+labels <- paste("Label", 1:8, sep = " ")
+## Plot x axis labels at default tick marks
+text(1:8, par("usr")[3] - 0.25, srt = 45, adj = 1,
+     labels = labels, xpd = TRUE)
+## Plot x axis label at line 6 (of 7)
+mtext(1, text = "X Axis Label", line = 6)
+
+ +

When plotting the x axis labels, we use srt = 45 for text +rotation angle, adj = 1 to place the right end of text at the +tick marks, and xpd = TRUE to allow for text outside the plot +region. You can adjust the value of the 0.25 offset as required +to move the axis labels up or down relative to the x axis. See +?par for more information. +

+

Also see Figure 1 and associated code in Paul Murrell (2003), +“Integrating grid Graphics Output with Base Graphics Output”, +R News, 3/2, 7–12. +

+
+ + + +

7.28 Why is read.table() so inefficient?

+ +

By default, read.table() needs to read in everything as character +data, and then try to figure out which variables to convert to numerics +or factors. For a large data set, this takes considerable amounts of +time and memory. Performance can substantially be improved by using the +colClasses argument to specify the classes to be assumed for the +columns of the table. +
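A sketch, assuming a file mydata.csv whose three columns are known to be character, numeric and factor:
DF <- read.table("mydata.csv", header = TRUE, sep = ",",
                 colClasses = c("character", "numeric", "factor"))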

+
+ + + +

7.29 What is the difference between package and library?

+ +

A package is a standardized collection of material extending R, +e.g. providing code, data, or documentation. A library is a +place (directory) where R knows to find packages it can use (i.e., which +were installed). R is told to use a package (to “load” it and +add it to the search path) via calls to the function library. +I.e., library() is employed to load a package from libraries +containing packages. +

+

See R Add-On Packages, for more details. See also Uwe Ligges (2003), +“R Help Desk: Package Management”, R News, 3/3, +37–39. +

+
+ + + +

7.30 I installed a package but the functions are not there

+ +

To actually use the package, it needs to be loaded using +library(). +

+

See R Add-On Packages and What is the difference between package and library? for more information. +

+
+ + + +

7.31 Why doesn’t R think these numbers are equal?

+ +

The only numbers that can be represented exactly in R’s numeric type +are integers and fractions whose denominator is a power of 2. Other +numbers have to be rounded to (typically) 53 binary digits accuracy. As a +result, two floating point numbers will not reliably be equal unless they +have been computed by the same algorithm, and not always even then. For +example +

+
+
R> a <- sqrt(2)
+R> a * a == 2
+[1] FALSE
+R> a * a - 2
+[1] 4.440892e-16
+
+ +

The function all.equal() compares two objects using a numeric +tolerance of .Machine$double.eps ^ 0.5. If you want much greater +accuracy than this you will need to consider error propagation +carefully. +
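Continuing the example above:
a <- sqrt(2)
all.equal(a * a, 2)          # TRUE: equal up to the numeric tolerance
isTRUE(all.equal(a * a, 2))  # a safe form to use inside if()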

+

For more information, see e.g. David Goldberg (1991), “What Every +Computer Scientist Should Know About Floating-Point Arithmetic”, +ACM Computing Surveys, 23/1, 5–48, also available via +http://www.validlab.com/goldberg/paper.pdf. +

+

To quote from “The Elements of Programming Style” by Kernighan and +Plauger: +

+
+

10.0 times 0.1 is hardly ever 1.0. +

+ + +
+ + + +

7.32 How can I capture or ignore errors in a long simulation?

+ +

Use try(), which returns an object of class "try-error" +instead of an error, or preferably tryCatch(), where the return +value can be configured more flexibly. For example +

+
+
beta[i,] <- tryCatch(coef(lm(formula, data)),
+                     error = function(e) rep(NaN, 4))
+
+ +

would return the coefficients if the lm() call succeeded and +would return c(NaN, NaN, NaN, NaN) if it failed (presumably there +are supposed to be 4 coefficients in this example). +

+
+ + + +

7.33 Why are powers of negative numbers wrong?

+ +

You are probably seeing something like +

+
+
R> -2^2
+[1] -4
+
+ +

and misunderstanding the precedence rules for expressions in R. +Write +

+
+
R> (-2)^2
+[1] 4
+
+ +

to get the square of -2. +

+

The precedence rules are documented in ?Syntax, and to see how R +interprets an expression you can look at the parse tree +

+
+
R> as.list(quote(-2^2))
+[[1]]
+`-`
+
+[[2]]
+2^2
+
+ +
+ + + +

7.34 How can I save the result of each iteration in a loop into a separate file?

+ +

One way is to use paste() (or sprintf()) to concatenate a +stem filename and the iteration number while file.path() +constructs the path. For example, to save results into files +result1.rda, …, result100.rda in the subdirectory +Results of the current working directory, one can use +

+
+
for(i in 1:100) {
+  ## Calculations constructing "some_object" ...
+  fp <- file.path("Results", paste("result", i, ".rda", sep = ""))
+  save(list = "some_object", file = fp)
+}
+
+ +
+ + + +

7.35 Why are p-values not displayed when using lmer()?

+ +

Doug Bates has kindly provided an extensive response in a post to the +r-help list, which can be reviewed at +https://stat.ethz.ch/pipermail/r-help/2006-May/094765.html. +

+
+ + + +

7.36 Why are there unwanted borders, lines or grid-like artifacts when viewing a plot saved to a PS or PDF file?

+ +

This can occur when using functions such as polygon(), +filled.contour(), image() or other functions which may +call these internally. In the case of polygon(), you may observe +unwanted borders between the polygons even when setting the +border argument to NA or "transparent". +

+

The source of the problem is the PS/PDF viewer when the plot is +anti-aliased. The details for the solution will be different depending +upon the viewer used, the operating system and may change over time. +For some common viewers, consider the following: +

+
+
Acrobat Reader (cross platform) + +
+

There are options in Preferences to enable/disable text smoothing, image +smoothing and line art smoothing. +Disable line art smoothing. +

+
Preview (OS X) + +
+

There is an option in Preferences to enable/disable anti-aliasing of +text and line art. +Disable this option. +

+
GSview (cross platform) + +
+

There are settings for Text Alpha and Graphics Alpha. +Change Graphics Alpha from 4 bits to 1 bit to disable graphic +anti-aliasing. +

+
gv (Unix-like X) + +
+

There is an option to enable/disable anti-aliasing. +Disable this option. +

+
Evince (Linux/GNOME) + +
+

There is not an option to disable anti-aliasing in this viewer. +

+
Okular (Linux/KDE) + +
+

There is not an option in the GUI to enable/disable anti-aliasing. +From a console command line, use: +

+
$ kwriteconfig --file okularpartrc --group 'Dlg Performance' \
+               --key GraphicsAntialias Disabled
+
+

Then restart Okular. Change the final word to ‘Enabled’ to restore +the original setting. +

+
+ +
+ + + +

7.37 Why does backslash behave strangely inside strings?

+ +

This question most often comes up in relation to file names (see +How do file names work in Windows?) but it also happens that +people complain that they cannot seem to put a single ‘\’ character +into a text string unless it happens to be followed by certain other +characters. +

+

To understand this, you have to distinguish between character strings +and representations of character strings. Mostly, the +representation in R is just the string with a single or double quote at +either end, but there are strings that cannot be represented that way, +e.g., strings that themselves contain the quote character. So +

+
+
> str <- "This \"text\" is quoted"
+> str
+[1] "This \"text\" is quoted"
+> cat(str, "\n")
+This "text" is quoted
+
+ +

The escape sequences ‘\"’ and ‘\n’ represent a double +quote and the newline character respectively. Printing text strings, +using print() or by typing the name at the prompt will use the +escape sequences too, but the cat() function will display the +string as-is. Notice that ‘"\n"’ is a one-character string, not +two; the backslash is not actually in the string, it is just generated +in the printed representation. +

+
+
> nchar("\n")
+[1] 1
+> substring("\n", 1, 1)
+[1] "\n"
+
+ +

So how do you put a backslash in a string? For this, you have to +escape the escape character. I.e., you have to double the backslash, +as in +

+
+
> cat("\\n", "\n")
+\n
+
+ +

Some functions, particularly those involving regular expression +matching, themselves use metacharacters, which may need to be escaped by +the backslash mechanism. In those cases you may need a quadruple +backslash to represent a single literal one. +
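For example, to match one literal backslash with a regular expression:
x <- "a\\b"       # the string a\b (three characters)
grepl("\\\\", x)  # TRUE: the pattern \\ matches a single literal backslash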

+

In versions of R up to 2.4.1 an unknown escape sequence like ‘\p’ +was quietly interpreted as just ‘p’. Current versions of R emit a +warning. +

+
+ + + +

7.38 How can I put error bars or confidence bands on my plot?

+ +

Some functions will display a particular kind of plot with error bars, +such as the bar.err() function in the agricolae +package, the plotCI() function in the gplots package, +the plotCI() and brkdn.plot() functions in the +plotrix package and the error.bars(), +error.crosses() and error.bars.by() functions in the +psych package. Within these types of functions, some will +accept the measures of dispersion (e.g., plotCI), some will +calculate the dispersion measures from the raw values (bar.err, +brkdn.plot), and some will do both (error.bars). Still +other functions will just display error bars, like the dispersion +function in the plotrix package. Most of the above functions +use the arrows() function in the base graphics package to +draw the error bars. +

+

The above functions all use the base graphics system. The grid and +lattice graphics systems also have specific functions for displaying +error bars, e.g., the grid.arrow() function in the grid +package, and the geom_errorbar(), geom_errorbarh(), +geom_pointrange(), geom_linerange(), +geom_crossbar() and geom_ribbon() functions in the +ggplot2 package. In the lattice system, error bars can be +displayed with Dotplot() or xYplot() in the +Hmisc package and segplot() in the +latticeExtra package. +

+
+ + + +

7.39 How do I create a plot with two y-axes?

+ +

Creating a graph with two y-axes, i.e., with two sorts of data that are +scaled to the same vertical size and showing separate vertical axes on +the left and right sides of the plot that reflect the original scales of +the data, is possible in R but is not recommended. The basic approach +for constructing such graphs is to use par(new=TRUE) (see +?par); functions twoord.plot() (in the plotrix +package) and doubleYScale() (in the latticeExtra +package) automate the process somewhat. +

+
+ + + +

7.40 How do I access the source code for a function?

+ +

In most cases, typing the name of the function will print its source +code. However, code is sometimes hidden in a namespace, or compiled. For +a complete overview on how to access source code, see Uwe Ligges (2006), +“Help Desk: Accessing the sources”, R News, 6/4, +43–45 (https://CRAN.R-project.org/doc/Rnews/Rnews_2006-4.pdf). +

+
+ + + +

7.41 Why does summary() report strange results for the R^2 estimate when I fit a linear model with no intercept?

+ +

As described in ?summary.lm, when the intercept is zero (e.g., +from y ~ x - 1 or y ~ x + 0), summary.lm() uses the +formula + R^2 = 1 - Sum(R[i]^2) / Sum((y[i])^2) +which is different from the usual + R^2 = 1 - Sum(R[i]^2) / Sum((y[i] - mean(y))^2). +There are several reasons for this: +

    +
  • Otherwise the R^2 could be negative (because the model with zero +intercept can fit worse than the constant-mean model it is +implicitly compared to). +
  • If you set the slope to zero in the model with a line through the +origin you get fitted values y*=0. +
  • The model with constant, non-zero mean is not nested in the model +with a line through the origin. +
+ +

All these come down to saying that if you know a priori that +E[Y]=0 when x=0 then the ‘null’ model that you should +compare to the fitted line, the model where x doesn’t explain any +of the variance, is the model where E[Y]=0 everywhere. (If you +don’t know a priori that E[Y]=0 when x=0, then you +probably shouldn’t be fitting a line through the origin.) +
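A small demonstration (simulated data, purely illustrative):
set.seed(1)
x <- 1:10
y <- 2 * x + rnorm(10)
summary(lm(y ~ x))$r.squared      # usual definition, against the constant-mean model
summary(lm(y ~ x - 1))$r.squared  # computed against the model with E[Y] = 0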

+
+ + + +

7.42 Why is R apparently not releasing memory?

+ +

This question is often asked in different flavors along the lines of +“I have removed objects in R and run gc() and yet +ps/top still shows the R process using a lot of +memory”, often on Linux machines. +

+

This is an artifact of the way the operating system (OS) allocates +memory. In general it is common that the OS is not capable of +releasing all unused memory. In extreme cases it is possible that even +if R frees almost all its memory, the OS can not release any of it due +to its design and thus tools such as ps or top will +report substantial amount of resident RAM used by the R process even +though R has released all that memory. In general such tools do +not report the actual memory usage of the process but rather +what the OS is reserving for that process. +

+

The short answer is that this is a limitation of the memory allocator +in the operating system and there is nothing R can do about it. That +space is simply kept by the OS in the hope that R will ask for it +later. The following paragraph gives more in-depth answer with +technical details on how this happens. +

+

Most systems use two separate ways to allocate memory. For allocation +of large chunks they will use mmap to map memory into the +process address space. Such chunks can be released immediately when +they are completely free, because they can reside anywhere in the +virtual memory. However, this is a relatively expensive operation and +many OSes have a limit on the number of such allocated chunks, so this +is only used for allocating large memory regions. For smaller +allocations the system can expand the data segment of the process +(historically using the brk system call), but this whole area +is always contiguous. The OS can only move the end of this space, it +cannot create any “holes”. Since this operation is fairly cheap, it +is used for allocations of small pieces of memory. However, the +side-effect is that even if there is just one byte that is in use +at the end of the data segment, the OS cannot release any memory +at all, because it cannot change the address of that byte. This is +actually more common than it may seem, because allocating a lot of +intermediate objects, then allocating a result object and removing all +intermediate objects is a very common practice. Since the result is +allocated at the end it will prevent the OS from releasing any memory +used by the intermediate objects. In practice, this is not necessarily +a problem, because modern operating systems can page out unused +portions of the virtual memory so it does not necessarily reduce the +amount of real memory available for other applications. Typically, +small objects such as strings or pairlists will be affected by this +behavior, whereas large objects such as long vectors will be allocated +using mmap and thus not affected. On Linux (and possibly other +Unix-like systems) it is possible to use the mallinfo system call +(also see the mallinfo package) to +query the allocator about the layout of the allocations, including the +actually used memory as well as unused memory that cannot be released. +

+
+ + + +

7.43 How can I enable secure https downloads in R?

+ + +

When R transfers files over HTTP (e.g., using the +install.packages() or download.file() function), a +download method is chosen based on the download.file.method +option. There are several methods available and the default behavior if +no option is explicitly specified is to use R’s internal HTTP +implementation. In most circumstances this internal method will not +support HTTPS URLs so you will need to override the default: +this is done automatically for such URLs as from R 3.2.2. +

+

R versions 3.2.0 and greater include two download methods +("libcurl" and "wininet") that both support +HTTPS connections: we recommend that you use these methods. +The requisite code to add to .Rprofile or Rprofile.site is: +

+
+
options(download.file.method = "wininet", url.method = "wininet")     (Windows)
+options(download.file.method = "libcurl", url.method = "libcurl")     (Linux and OS X)
+
+ +

(Method "wininet" is the default on Windows as from R 3.2.2.) +

+

Note that the "libcurl" method may or may not have been compiled +in. In the case that it was not, i.e., capabilities("libcurl") == +FALSE, we recommend method "wget" on Linux and "curl" on +OS X. It is possible that system versions of "libcurl", +wget or curl may have been compiled without +HTTPS support, but this is unlikely. As from R 3.3.0 +"libcurl" with HTTPS support is required except on +Windows. +

+ +
+ +
+


+
+ +

8 R Programming


8.1 How should I write summary methods?

+ +

Suppose you want to provide a summary method for class "foo". +Then summary.foo() should not print anything, but return an +object of class "summary.foo", and you should write a +method print.summary.foo() which nicely prints the summary +information and invisibly returns its object. This approach is +preferred over having summary.foo() print summary information and +return something useful, as sometimes you need to grab something +computed by summary() inside a function or similar. In such +cases you don’t want anything printed. +
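A minimal sketch of this pattern (the class "foo" and its fields are invented for illustration):
summary.foo <- function(object, ...) {
  res <- list(n = length(object$x), mean = mean(object$x))
  class(res) <- "summary.foo"
  res
}
print.summary.foo <- function(x, ...) {
  cat("n =", x$n, "  mean =", format(x$mean), "\n")
  invisible(x)
}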

+
+ + + +

8.2 How can I debug dynamically loaded code?

+ +

Roughly speaking, you need to start R inside the debugger, load the +code, send an interrupt, and then set the required breakpoints. +

+

See section “Finding entry points in dynamically loaded code” in +Writing R Extensions. This manual is included in the R +distribution, see What documentation exists for R?. +

+
+ + + +

8.3 How can I inspect R objects when debugging?

+ +

The most convenient way is to call R_PV from the symbolic +debugger. +

+

See section “Inspecting R objects when debugging” in Writing R +Extensions. +

+
+ + + +

8.4 How can I change compilation flags?

+ +

Suppose you have C code file for dynloading into R, but you want to use +R CMD SHLIB with compilation flags other than the default ones +(which were determined when R was built). +

+

Starting with R 2.1.0, users can provide personal Makevars configuration +files in $HOME/.R to override the default flags. +See section “Add-on packages” in R Installation and +Administration. +

+

For earlier versions of R, you could change the file +R_HOME/etc/Makeconf to reflect your preferences, or (at +least for systems using GNU Make) override them by the +environment variable MAKEFLAGS. +See section “Creating shared objects” in Writing R Extensions. +

+
+ + + +

8.5 How can I debug S4 methods?

+ +

Use the trace() function with argument signature= to add +calls to the browser or any other code to the method that will be +dispatched for the corresponding signature. See ?trace for +details. +

+
+ +
+


+
+ +

9 R Bugs

+


+
+ +

9.1 What is a bug?

+ +

If R executes an illegal instruction, or dies with an operating system +error message that indicates a problem in the program (as opposed to +something like “disk full”), then it is certainly a bug. If you call +.C(), .Fortran(), .External() or .Call() (or +.Internal()) yourself (or in a function you wrote), you can +always crash R by using wrong argument types (modes). This is not a +bug. +

+

Taking forever to complete a command can be a bug, but you must make +certain that it was really R’s fault. Some commands simply take a long +time. If the input was such that you know it should have been +processed quickly, report a bug. If you don’t know whether the command +should take a long time, find out by looking in the manual or by asking +for assistance. +

+

If a command you are familiar with causes an R error message in a case +where its usual definition ought to be reasonable, it is probably a bug. +If a command does the wrong thing, that is a bug. But be sure you know +for certain what it ought to have done. If you aren’t familiar with the +command, or don’t know for certain how the command is supposed to work, +then it might actually be working right. For example, people sometimes +think there is a bug in R’s mathematics because they don’t understand +how finite-precision arithmetic works. Rather than jumping to +conclusions, show the problem to someone who knows for certain. +Unexpected results of comparison of decimal numbers, for example +0.28 * 100 != 28 or 0.1 + 0.2 != 0.3, are not a bug. +See Why doesn't R think these numbers are equal?, for more details. +

+

Finally, a command’s intended definition may not be best for statistical +analysis. This is a very important sort of problem, but it is also a +matter of judgment. Also, it is easy to come to such a conclusion out +of ignorance of some of the existing features. It is probably best not +to complain about such a problem until you have checked the +documentation in the usual ways, feel confident that you understand it, +and know for certain that what you want is not available. If you are +not sure what the command is supposed to do after a careful reading of +the manual this indicates a bug in the manual. The manual’s job is to +make everything clear. It is just as important to report documentation +bugs as program bugs. However, we know that the introductory +documentation is seriously inadequate, so you don’t need to report this. +

+

If the online argument list of a function disagrees with the manual, one +of them must be wrong, so report the bug. +

9.2 How to report a bug

When you decide that there is a bug, it is important to report it and to +report it in a way which is useful. What is most useful is an exact +description of what commands you type, starting with the shell command +to run R, until the problem happens. Always include the version of R, +machine, and operating system that you are using; type version in +R to print this. +

+

The most important principle in reporting a bug is to report +facts, not hypotheses or categorizations. It is always easier to +report the facts, but people seem to prefer to strain to posit +explanations and report them instead. If the explanations are based on +guesses about how R is implemented, they will be useless; others will +have to try to figure out what the facts must have been to lead to such +speculations. Sometimes this is impossible. But in any case, it is +unnecessary work for the ones trying to fix the problem. +

+

For example, suppose that on a data set which you know to be quite large +the command +

+
+
R> data.frame(x, y, z, monday, tuesday)
+
+ +

never returns. Do not report that data.frame() fails for large +data sets. Perhaps it fails when a variable name is a day of the week. +If this is so then when others got your report they would try out the +data.frame() command on a large data set, probably with no day of +the week variable name, and not see any problem. There is no way in the +world that others could guess that they should try a day of the week +variable name. +

+

Or perhaps the command fails because the last command you used was a +method for "["() that had a bug causing R’s internal data +structures to be corrupted and making the data.frame() command +fail from then on. This is why others need to know what other commands +you have typed (or read from your startup file). +

+

It is very useful to try and find simple examples that produce +apparently the same bug, and somewhat useful to find simple examples +that might be expected to produce the bug but actually do not. If you +want to debug the problem and find exactly what caused it, that is +wonderful. You should still report the facts as well as any +explanations or solutions. Please include an example that reproduces +(e.g., https://en.wikipedia.org/wiki/Reproducibility) the problem, +preferably the simplest one you have found. +

+

Invoking R with the --vanilla option may help in isolating a +bug. This ensures that the site profile and saved data files are not +read. +

+

Before you actually submit a bug report, you should check whether the +bug has already been reported and/or fixed. First, try the “Show open +bugs new-to-old” or the search facility on +https://bugs.R-project.org/. Second, consult +https://svn.R-project.org/R/trunk/doc/NEWS.Rd, which +records changes that will appear in the next release of R, +including bug fixes that do not appear on the Bug Tracker. +Third, if possible try the current r-patched or r-devel version of R. +If a bug has already been reported or fixed, please do not submit +further bug reports on it. Finally, check carefully whether the bug is +with R, or a contributed package. Bug reports on contributed packages +should be sent first to the package maintainer, and only submitted to +the R-bugs repository by package maintainers, mentioning the package in +the subject line. +

+

A bug report can be generated using the function bug.report(). +For reports on R this will open the Web page at +https://bugs.R-project.org/: for a contributed package it will open +the package’s bug tracker Web page or help you compose an email to the +maintainer. +

+

There is a section of the bug repository for suggestions for +enhancements for R labelled ‘wishlist’. Suggestions can be +submitted in the same ways as bugs, but please ensure that the subject +line makes clear that this is for the wishlist and not a bug report, for +example by starting with ‘Wishlist:’. +

+

Comments on and suggestions for the Windows port of R should be sent to +R-windows@R-project.org. +

+

Corrections to and comments on message translations should be sent to the +last translator (listed at the top of the appropriate ‘.po’ file) +or to the translation team as listed at +https://developer.R-project.org/TranslationTeams.html. +

+
10 Acknowledgments

Of course, many many thanks to Robert and Ross for the R system, and to +the package writers and porters for adding to it. +

+

Special thanks go to Doug Bates, Peter Dalgaard, Paul Gilbert, Stefano +Iacus, Fritz Leisch, Jim Lindsey, Thomas Lumley, Martin Maechler, Brian +D. Ripley, Anthony Rossini, and Andreas Weingessel for their comments +which helped me improve this FAQ. +

+

More to come soon … +

diff --git a/R-admin.html b/R-admin.html
new file mode 100644
index 0000000..0f89d31
--- /dev/null
+++ b/R-admin.html
@@ -0,0 +1,7337 @@

R Installation and Administration


This is a guide to installation and administration for R. +

+

This manual is for R, version 3.2.3 (2015-12-10). +

+

Copyright © 2001–2015 R Core Team +

+
+

Permission is granted to make and distribute verbatim copies of this +manual provided the copyright notice and this permission notice are +preserved on all copies. +

+

Permission is granted to copy and distribute modified versions of this +manual under the conditions for verbatim copying, provided that the +entire resulting derived work is distributed under the terms of a +permission notice identical to this one. +

+

Permission is granted to copy and distribute translations of this manual +into another language, under the above conditions for modified versions, +except that this permission notice may be stated in a translation +approved by the R Core Team. +

1 Obtaining R

Sources, binaries and documentation for R can be obtained via +CRAN, the “Comprehensive R Archive Network” whose current +members are listed at https://CRAN.R-project.org/mirrors.html. +

1.1 Getting and unpacking the sources

The simplest way is to download the most recent +R-x.y.z.tar.gz file, and unpack it with +

+
+
tar -xf R-x.y.z.tar.gz
+
+ +

on systems that have a suitable tar installed. On other systems you need to have the gzip program installed, in which case you can use

+
+
gzip -dc R-x.y.z.tar.gz | tar -xf -
+
+ +

The pathname of the directory into which the sources are unpacked should +not contain spaces, as most make programs (and specifically +GNU make) do not expect spaces. +

+

If you want the build to be usable by a group of users, set umask +before unpacking so that the files will be readable by the target group +(e.g., umask 022 to be usable by all users). Keep this +setting of umask whilst building and installing. +

+

If you use a recent GNU version of tar and do this +as a root account (which on Windows includes accounts with administrator +privileges) you may see many warnings about changing ownership. In +which case you can use +

+
+
tar --no-same-owner -xf R-x.y.z.tar.gz
+
+ +

and perhaps also include the option --no-same-permissions. + +(These options can also be set in the TAR_OPTIONS environment +variable: if more than one option is included they should be separated +by spaces.) +
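For example, in a Bourne-style shell the two tar options mentioned above could be set once and reused:

TAR_OPTIONS="--no-same-owner --no-same-permissions"
export TAR_OPTIONS
tar -xf R-x.y.z.tar.gz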

+ + +
+ + + +

1.2 Getting patched and development versions

+ +

A patched version of the current release, ‘r-patched’, and the +current development version, ‘r-devel’, are available as daily +tarballs and via access to the R Subversion repository. (For the two +weeks prior to the release of a minor (3.x.0) version, ‘r-patched’ +tarballs may refer to beta/release candidates of the upcoming release, +the patched version of the current release being available via +Subversion.) +

+

The tarballs are available from +https://stat.ethz.ch/R/daily. Download +R-patched.tar.gz or R-devel.tar.gz (or the .tar.bz2 +versions) and unpack as described in the previous section. They are +built in exactly the same way as distributions of R releases. +

+ + + + +
+ + + +

1.2.1 Using Subversion and rsync

+ + +

Sources are also available via https://svn.R-project.org/R/, the +R Subversion repository. If you have a Subversion client (see +https://subversion.apache.org/), you can check out and update the +current ‘r-devel’ from +https://svn.r-project.org/R/trunk/ and the current +‘r-patched’ from +‘https://svn.r-project.org/R/branches/R-x-y-branch/’ +(where x and y are the major and minor number of the current +released version of R). E.g., use +

+
+
svn checkout https://svn.r-project.org/R/trunk/ path
+
+ +

to check out ‘r-devel’ into directory path (which will be +created if necessary). The alpha, beta and RC versions of an upcoming +x.y.0 release are available from +‘https://svn.r-project.org/R/branches/R-x-y-branch/’ in +the four-week period prior to the release. +

+

Note that ‘https:’ is required2, +and that the SSL certificate for the Subversion server of the R +project should be recognized as from a trusted source. +

+

Note that retrieving the sources by e.g. wget -r or svn export from that URL will not work (and will give an error early in the make process): the Subversion information is needed to build R.

+

The Subversion repository does not contain the current sources for the +recommended packages, which can be obtained by rsync or +downloaded from CRAN. To use rsync to install the +appropriate sources for the recommended packages, run +./tools/rsync-recommended from the top-level directory of the +R sources. +

+

If downloading manually from CRAN, do ensure that you have the +correct versions of the recommended packages: if the number in the file +VERSION is ‘x.y.z’ you need to download +the contents of ‘https://CRAN.R-project.org/src/contrib/dir’, +where dir is ‘x.y.z/Recommended’ for +r-devel or x.y-patched/Recommended for r-patched, +respectively, to directory src/library/Recommended in the sources +you have unpacked. After downloading manually you need to execute +tools/link-recommended from the top level of the sources to +make the requisite links in src/library/Recommended. A suitable +incantation from the top level of the R sources using wget +might be (for the correct value of dir) +

+
+
wget -r -l1 --no-parent -A\*.gz -nd -P src/library/Recommended \
+  https://CRAN.R-project.org/src/contrib/dir
+./tools/link-recommended
+
+ + + +
+ +
+


+
+ +

2 Installing R under Unix-alikes

+ + +

R will configure and build under most common Unix and Unix-alike +platforms including ‘cpu-*-linux-gnu’ for the +‘alpha’, ‘arm’, ‘hppa’, ‘ix86’, +‘m68k’, ‘mips’, ‘mipsel’, ‘powerpc’, +‘s390’, ‘sparc’, and ‘x86_64CPUs, +‘x86_64-apple-darwin’, ‘i386-sun-solaris’ and +‘sparc-sun-solaris’ as well as +perhaps (it is tested less frequently on these platforms) +‘i386-apple-darwin’, ‘i386-*-freebsd’, ‘x86_64-*-freebsd’, +‘i386-*-netbsd’, ‘x86_64/*-openbsd’ and +‘powerpc-ibm-aix6*’ +

+ + +

In addition, binary distributions are available for some common Linux +distributions and for OS X (formerly Mac OS). See the FAQ for +current details. These are installed in platform-specific ways, so for +the rest of this chapter we consider only building from the sources. +

+

Cross-building is not possible: installing R builds a minimal version +of R and then runs many R scripts to complete the build. +

+ + + + + + + + + + + +
+ + + +

2.1 Simple compilation

+ +

First review the essential and useful tools and libraries in +Essential and useful other programs under a Unix-alike, and install +those you + +want or need. Ensure that the environment variable TMPDIR is +either unset (and /tmp exists and can be written in and scripts +can be executed from) or points to the absolute path to a valid +temporary directory (one from which execution of scripts is allowed) +which does not contain spaces.3 +
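A minimal example of such a setting (the directory name is only illustrative):

TMPDIR=/var/tmp/build-of-R      # absolute path, no spaces, scripts executable
export TMPDIR
mkdir -p "$TMPDIR"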

+ +

Choose a directory to install the R tree (R is not just a binary, but +has additional data sets, help files, font metrics etc). Let us call +this place R_HOME. Untar the source code. This should create +directories src, doc, and several more under a top-level +directory: change to that top-level directory (At this point North +American readers should consult Setting paper size.) Issue the +following commands: +

+ + +
+
./configure
+make
+
+ +

(See Using make if your make is not called ‘make’.) Users of +Debian-based 64-bit systems4 may need +

+
+
./configure LIBnn=lib
+make
+
+ + +

Then check the built system works correctly by +

+
+
make check
+
+ +

Failures are not necessarily problems as they might be caused by missing +functionality, but you should look carefully at any reported +discrepancies. (Some non-fatal errors are expected in locales that do +not support Latin-1, in particular in true C locales and +non-UTF-8 non-Western-European locales.) A failure in +tests/ok-errors.R may indicate inadequate resource limits +(see Running R). +

+

More comprehensive testing can be done by +

+
+
make check-devel
+
+ +

or +

+
+
make check-all
+
+ +

see file tests/README and Testing a Unix-alike Installation +for the possibilities of doing this in parallel. Note that these checks +are only run completely if the recommended packages are installed. +

+

If the configure and make commands execute successfully, a shell-script front-end called R will be created and copied to R_HOME/bin. You can link or copy this script to a place where users can invoke it, for example to /usr/local/bin/R. You could also copy the man page R.1 to a place where your man reader finds it, such as /usr/local/man/man1. If you want to install the complete R tree to, e.g., /usr/local/lib/R, see Installation. Note: you do not need to install R: you can run it from where it was built.

+

You do not necessarily have to build R in the top-level source +directory (say, TOP_SRCDIR). To build in +BUILDDIR, run +

+ + +
+
cd BUILDDIR
+TOP_SRCDIR/configure
+make
+
+ +

and so on, as described further below. This has the advantage of always +keeping your source tree clean and is particularly recommended when you +work with a version of R from Subversion. (You may need +GNU make to allow this, and you will need no spaces +in the path to the build directory. It is unlikely to work if the +source directory has previously been used for a build.) +

+ + + +

Now rehash if necessary, type R, and read the R manuals +and the R FAQ (files FAQ or +doc/manual/R-FAQ.html, or +https://CRAN.R-project.org/doc/FAQ/R-FAQ.html which always +has the version for the latest release of R). +

+

Note: if you already have R installed, check that where you installed +R replaces or comes earlier in your path than the previous +installation. Some systems are set up to have /usr/bin (the +standard place for a system installation) ahead of /usr/local/bin +(the default place for installation of R) in their default path, and +some do not have /usr/local/bin on the default path. +

+
+ + + +

2.2 Help options

+ +

By default HTML help pages are created when needed rather than being +built at install time. +

+

If you need to disable the server and want HTML help, there is the +option to build HTML pages when packages are installed +(including those installed with R). This is enabled by the +configure option --enable-prebuilt-html. Whether +R CMD INSTALL (and hence install.packages) pre-builds +HTML pages is determined by looking at the R installation and is +reported by R CMD INSTALL --help: it can be overridden by +specifying one of the INSTALL options --html or +--no-html. +

+

The server is disabled by setting the environment variable + +R_DISABLE_HTTPD to a non-empty value, either before R is +started or within the R session before HTML help (including +help.start) is used. It is also possible that system security +measures will prevent the server from being started, for example if the +loopback interface has been disabled. See +?tools::startDynamicHelp for more details. +
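For instance, one might configure with pre-built HTML pages, or run a single session with the help server disabled, along these lines (illustrative only):

./configure --enable-prebuilt-html
make
# later, start a session with the dynamic help server disabled
R_DISABLE_HTTPD=yes R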

+
+ + + +

2.3 Making the manuals

+ + +

There is a set of manuals that can be built from the sources, +

+
+
fullrefman
+

Printed versions of all the help pages for base and recommended packages +(around 3500 pages). +

+
refman
+

Printed versions of the help pages for selected base packages (around +2000 pages) +

+
R-FAQ
+

R FAQ +

+
R-intro
+

“An Introduction to R”. +

+
R-data
+

“R Data Import/Export”. +

+
R-admin
+

“R Installation and Administration”, this manual. +

+
R-exts
+

“Writing R Extensions”. +

+
R-lang
+

“The R Language Definition”. +

+
+ +

To make these (with ‘fullrefman’ rather than ‘refman’), use +

+
+
make pdf      to create PDF versions
+make info     to create info files (not ‘refman’ nor ‘fullrefman’).
+
+ +

You will not be able to build any of these unless you have +texi2any version 5.1 or later installed, and for PDF you must +have texi2dvi and texinfo.tex installed (which are part +of the GNU texinfo distribution but are, especially +texinfo.tex, often made part of the TeX package in +re-distributions). For historical reasons, the path to +texi2any can be set by macro ‘MAKEINFO’ in +config.site (makeinfo is nowadays a link to +texi2any). +

+

The PDF versions can be viewed using any recent PDF viewer: they have +hyperlinks that can be followed. The info files are suitable for +reading online with Emacs or the standalone GNU info +program. The PDF versions will be created using the paper size selected +at configuration (default ISO a4): this can be overridden by setting +R_PAPERSIZE + +on the make command line, or setting R_PAPERSIZE in the +environment and using make -e. (If re-making the manuals for +a different paper size, you should first delete the file +doc/manual/version.texi. The usual value for North America would +be ‘letter’.) +
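For example, a North American user re-making the PDF manuals might override the paper size on the make command line (after deleting doc/manual/version.texi as noted above):

make pdf R_PAPERSIZE=letter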

+

There are some issues with making the PDF reference manual, +fullrefman.pdf or refman.pdf. The help files contain both +ISO Latin1 characters (e.g. in text.Rd) and upright quotes, +neither of which are contained in the standard LaTeX Computer Modern +fonts. We have provided four alternatives: +

+
+
times
+

(The default.) Using standard PostScript fonts, Times Roman, Helvetica +and Courier. This works well both for on-screen viewing and for +printing. One disadvantage is that the Usage and Examples sections may +come out rather wide: this can be overcome by using in addition +either of the options inconsolata (on a Unix-alike only if found +by configure) or beramono, which replace the Courier +monospaced font by Inconsolata or Bera Sans mono respectively. (You +will need a recent version of the appropriate LaTeX package +inconsolata5 or +bera installed.) +

+

Note that in most LaTeX installations this will not actually use the +standard fonts for PDF, but rather embed the URW clones NimbusRom, +NimbusSans and (for Courier, if used) NimbusMon. +

+

This needs LaTeX packages times, helvetic and (if used) +courier installed. +

+
+
lm
+

Using the Latin Modern fonts. These are not often installed as +part of a TeX distribution, but can obtained from +https://www.ctan.org/tex-archive/fonts/ps-type1/lm/ and +mirrors. This uses fonts rather similar to Computer Modern, but is not +so good on-screen as times. +

+
+
cm-super
+

Using type-1 versions of the Computer Modern fonts by Vladimir Volovich. +This is a large installation, obtainable from +https://www.ctan.org/tex-archive/fonts/ps-type1/cm-super/ +and its mirrors. These type-1 fonts have poor hinting and so are +nowhere near as readable on-screen as the other three options. +

+
+
ae
+

A package to use composites of Computer Modern fonts. This works well +most of the time, and its PDF is more readable on-screen than the +previous two options. There are three fonts for which it will need to +use bitmapped fonts, tctt0900.600pk, tctt1000.600pk and +tcrm1000.600pk. Unfortunately, if those files are not available, +Acrobat Reader will substitute completely incorrect glyphs so you need +to examine the logs carefully. +

+
+ +

The default can be overridden by setting the environment variable + +R_RD4PDF. (On Unix-alikes, this will be picked up at install time +and stored in etc/Renviron, but can still be overridden when the +manuals are built, using make -e.) The usual6 default value for R_RD4PDF is +‘times,inconsolata,hyper’: omit ‘hyper’ if you do not want +hyperlinks (e.g. for printing the manual) or do not have LaTeX +package hyperref, and omit ‘inconsolata’ if you do not have +LaTeX package inconsolata installed. +
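For instance, to rebuild the manuals without hyperlinks and without the inconsolata package, one might override the default when building (a sketch, using make -e as described above):

R_RD4PDF="times" make -e pdf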

+

Further options, e.g for hyperref, can be included in a file +Rd.cfg somewhere on your LaTeX search path. For example, if +you prefer the text and not the page number in the table of contents to +be hyperlinked use +

+
\ifthenelse{\boolean{Rd@use@hyper}}{\hypersetup{linktoc=section}}{}
+
+ +

or +

+
\ifthenelse{\boolean{Rd@use@hyper}}{\hypersetup{linktoc=all}}{}
+
+ +

to hyperlink both text and page number. +

+

Ebook versions of most of the manuals in one or both of .epub and +.mobi formats can be made by running in doc/manual one of +

+
make ebooks
+make epub
+make mobi
+
+ +

This requires ebook-convert from Calibre (http://calibre-ebook.com/download, or from most Linux distributions). If necessary the path to ebook-convert can be set as make macro EBOOK by editing doc/manual/Makefile (which contains a commented value suitable for OS X).

+ +
+ + + +

2.4 Installation

+ + +

To ensure that the installed tree is usable by the right group of users, +set umask appropriately (perhaps to ‘022’) before unpacking +the sources and throughout the build process. +

+

After +

+ +
+
./configure
+make
+make check
+
+ +

(or, when building outside the source, +TOP_SRCDIR/configure, etc) have been completed +successfully, you can install the complete R tree to your system by +typing +

+
+
make install
+
+ +

A parallel make can be used (but run make before make +install). Those using GNU make 4.0 or later may want to use +make -j n -O to avoid interleaving of output. +

+

This will install to the following directories: +

+
+
prefix/bin or bindir
+

the front-end shell script and other scripts and executables +

+
prefix/man/man1 or mandir/man1
+

the man page +

+
prefix/LIBnn/R or libdir/R
+

all the rest (libraries, on-line help system, …). Here +LIBnn is usually ‘lib’, but may be ‘lib64’ on some +64-bit Linux systems. This is known as the R home directory. +

+
+ +

where prefix is determined during configuration (typically +/usr/local) and can be set by running configure with +the option --prefix, as in +

+ +
+
./configure --prefix=/where/you/want/R/to/go
+
+ +

where the value should be an absolute path. This causes make +install to install the R script to +/where/you/want/R/to/go/bin, and so on. The prefix of the +installation directories can be seen in the status message that is +displayed at the end of configure. The installation may need +to be done by the owner of prefix, often a root account. +

+

You can install into another directory tree by using +

+
+
make prefix=/path/to/here install
+
+ +

at least with GNU or Solaris make (but not some +older Unix makes). +

+

More precise control is available at configure time via options: see +configure --help for details. (However, most of the ‘Fine +tuning of the installation directories’ options are not used by R.) +

+

Configure options --bindir and --mandir are supported +and govern where a copy of the R script and the man +page are installed. +

+

The configure option --libdir controls where the main R +files are installed: the default is ‘eprefix/LIBnn’, +where eprefix is the prefix used for installing +architecture-dependent files, defaults to prefix, and can be set +via the configure option --exec-prefix. +

+

Each of bindir, mandir and libdir can also be +specified on the make install command line (at least for +GNU make). +

+

The configure or make variables rdocdir and +rsharedir can be used to install the system-independent +doc and share directories to somewhere other than +libdir. The C header files can be installed to the value of +rincludedir: note that as the headers are not installed into a +subdirectory you probably want something like +rincludedir=/usr/local/include/R-3.2.3. +

+

If you want the R home to be something other than +libdir/R, use rhome: for example +

+
+
make install rhome=/usr/local/lib64/R-3.2.3
+
+ +

will use a version-specific R home on a non-Debian Linux 64-bit +system. +

+

If you have made R as a shared/static library you can install it in +your system’s library directory by +

+
+
make prefix=/path/to/here install-libR
+
+ +

where prefix is optional, and libdir will give more +precise control.7 However, you should not install +to a directory mentioned in LDPATHS (e.g. +/usr/local/lib64) if you intend to work with multiple versions of +R, since that directory may be given precedence over the lib +directory of other R installations. +

+
+
make install-strip
+
+ +

will install stripped executables, and on platforms where this is +supported, stripped libraries in directories lib and +modules and in the standard packages. +

+

Note that installing R into a directory whose path contains spaces is +not supported, and some aspects (such as installing source packages) +will not work. +

+ + +

To install info and PDF versions of the manuals, use one or both of +

+
+
make install-info
+make install-pdf
+
+ +

Once again, it is optional to specify prefix, libdir or +rhome (the PDF manuals are installed under the R home +directory). (make install-info needs Perl installed +if there is no command install-info on the system.) +

+

More precise control is possible. For info, the setting used is that of +infodir (default prefix/info, set by configure +option --infodir). The PDF files are installed into the R +doc tree, set by the make variable rdocdir. +

+

A staged installation is possible, that is, installing R into a temporary directory in order to move the installed tree to its final destination. In this case prefix (and so on) should reflect the final destination, and DESTDIR should be used: see https://www.gnu.org/prep/standards/html_node/DESTDIR.html.
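A sketch of such a staged install (the staging path is only an example):

./configure --prefix=/usr/local
make
make install DESTDIR=/tmp/R-staging
# the tree under /tmp/R-staging/usr/local can then be moved to its destination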

+

You can optionally install the run-time tests that are part of +make check-all by +

+
+
make install-tests
+
+ +

which populates a tests directory in the installation. +

+ +
+ + + +

2.5 Uninstallation

+ +

You can uninstall R by +

+
+
make uninstall
+
+ +

optionally specifying prefix etc in the same way as specified for +installation. +

+

This will also uninstall any installed manuals. There are specific +targets to uninstall info and PDF manuals in file +doc/manual/Makefile. +

+

Target uninstall-tests will uninstall any installed tests, as +well as removing the directory tests containing the test results. +

+

An installed shared/static libR can be uninstalled by +

+
+
make prefix=/path/to/here uninstall-libR
+
+ + +
+ + + +

2.6 Sub-architectures

+ +

Some platforms can support closely related builds of R which can +share all but the executables and dynamic objects. Examples include +builds under Linux and Solaris for different CPUs or 32- and +64-bit builds. +

+

R supports the idea of architecture-specific builds, specified by +adding ‘r_arch=name’ to the configure line. Here +name can be anything non-empty, and is used to name subdirectories +of lib, etc, include and the package libs +subdirectories. Example names from other software are the use of +sparcv9 on Sparc Solaris and 32 by gcc on +‘x86_64’ Linux. +
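For example, a 64-bit sub-architecture build might be configured as follows (the name '64' is arbitrary, as noted above):

./configure r_arch=64
make && make install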

+

If you have two or more such builds you can install them over each other +(and for 32/64-bit builds on one architecture, one build can be done +without ‘r_arch’). The space savings can be considerable: on +‘x86_64’ Linux a basic install (without debugging symbols) took +74Mb, and adding a 32-bit build added 6Mb. If you have installed +multiple builds you can select which build to run by +

+
+
R --arch=name
+
+ +

and just running ‘R’ will run the last build that was installed. +

+

R CMD INSTALL will detect if more than one build is installed and +try to install packages with the appropriate library objects for each. +This will not be done if the package has an executable configure +script or a src/Makefile file. In such cases you can install for +extra builds by +

+
+
R --arch=name CMD INSTALL --libs-only pkg1 pkg2 …
+
+ +

If you want to mix sub-architectures compiled on different platforms +(for example ‘x86_64’ Linux and ‘i686’ Linux), it is +wise to use explicit names for each, and you may also need to set +libdir to ensure that they install into the same place. +

+

When sub-architectures are used the version of Rscript in +e.g. /usr/bin will be the last installed, but +architecture-specific versions will be available in e.g. +/usr/lib64/R/bin/exec${R_ARCH}. Normally all installed +architectures will run on the platform so the architecture of +Rscript itself does not matter. The executable +Rscript will run the R script, and at that time the + +setting of the R_ARCH environment variable determines the +architecture which is run. +

+

When running post-install tests with sub-architectures, use +

+
+
R --arch=name CMD make check[-devel|all]
+
+ +

to select a sub-architecture to check. +

+

Sub-architectures are also used on Windows, but by selecting executables +within the appropriate bin directory, +R_HOME/bin/i386 or R_HOME/bin/x64. For +backwards compatibility with R < 2.12.0, there are executables +R_HOME/bin/R.exe or R_HOME/bin/Rscript.exe: +these will run an executable from one of the subdirectories, which one +being taken first from the + +R_ARCH environment variable, then from the +--arch command-line option8 and finally from the +installation default (which is 32-bit for a combined 32/64 bit R +installation). +

+ + + + +
+ + + +

2.6.1 Multilib

+ +

On Linux9, there is an alternative mechanism for mixing 32-bit and 64-bit +libraries known as multilib. If a Linux distribution supports +multilib, then parallel builds of R may be installed in the +sub-directories lib (32-bit) and lib64 (64-bit). The +build to be run may then be selected using the setarch +command. For example, a 32-bit build may be run by +

+
+
setarch i686 R
+
+ +

The setarch command is only operational if both 32-bit and +64-bit builds are installed. If there is only one installation of R, +then this will always be run regardless of the architecture specified +by the setarch command. +

+

There can be problems with installing packages on the non-native +architecture. It is a good idea to run e.g. setarch i686 R for +sessions in which packages are to be installed, even if that is the only +version of R installed (since this tells the package installation +code the architecture needed). +

+

At present there is a potential problem with packages using Java, as +the post-install for a ‘i686’ RPM on ‘x86_64’ Linux +reconfigures Java and will find the ‘x86_64’ Java. If you know +where a 32-bit Java is installed you may be able to run (as root) +

+
+
export JAVA_HOME=<path to jre directory of 32-bit Java>
+setarch i686 R CMD javareconf
+
+ +

to get a suitable setting. +

+

When this mechanism is used, the version of Rscript in +e.g. /usr/bin will be the last installed, but an +architecture-specific version will be available in +e.g. /usr/lib64/R/bin. Normally all installed architectures +will run on the platform so the architecture of Rscript does +not matter. +

+
+ + + +

2.7 Other Options

+ +

There are many other installation options, most of which are listed by +configure --help. Almost all of those not listed elsewhere in +this manual are either standard autoconf options not relevant +to R or intended for specialist uses by the R developers. +

+

One that may be useful when working on R itself is the option +--disable-byte-compiled-packages, which ensures that the base +and recommended packages are lazyloaded but not byte-compiled. +(Alternatively the (make or environment) variable +R_NO_BASE_COMPILE can be set to a non-empty value for the duration +of the build.) +
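Either of the following would achieve this (illustrative):

./configure --disable-byte-compiled-packages
# or, for a single build
R_NO_BASE_COMPILE=yes make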

+

Option --with-internal-tzcode makes use of R’s own code and +copy of the Olson database for managing timezones. This will be +preferred where there are issues with the system implementation, usually +involving times after 2037 or before 1916. An alternative time-zone +directory10 can be used, pointed +to by environment variable TZDIR: this should contain files such +as Europe/London. On all tested OSes the system timezone was +deduced correctly, but if necessary it can be set as the value of +environment variable TZ. +
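A sketch of using these settings (the zoneinfo path shown is the usual system location but may differ):

./configure --with-internal-tzcode
# point R's own tzcode at an alternative zone database and set a zone explicitly
TZDIR=/usr/share/zoneinfo TZ=Europe/London R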

+ + + + + +
+ +
+


+
+ +

2.7.1 OpenMP Support

+ +

By default configure searches for suitable +options11 for OpenMP support for the C, C++98, FORTRAN +77 and Fortran compilers. +

+

Only the C result is currently used for R itself, and only if +MAIN_LD/DYLIB_LD were not specified. This can be +overridden by specifying +

+
R_OPENMP_CFLAGS
+
+ +

Use for packages has similar restrictions (involving SHLIB_LD and +similar: note that as FORTRAN 77 code is normally linked by the C +compiler, both need to support OpenMP) and can be overridden by +specifying some of +

+
SHLIB_OPENMP_CFLAGS
+SHLIB_OPENMP_CXXFLAGS
+SHLIB_OPENMP_FCFLAGS
+SHLIB_OPENMP_FFLAGS
+
+ +

Setting to an empty value will disable OpenMP for that compiler (and +configuring with --disable-openmp will disable all detection of +OpenMP). Note that the configure detection test is to compile +and link a standalone OpenMP program, which is not the same as compiling +a shared object and loading it into the C program of R’s executable. +Note that overridden values are not tested. +

+
+ + + +

2.8 Testing an Installation

+ +

Full testing is possible only if the test files have been installed with +

+
+
make install-tests
+
+ +

which populates a tests directory in the installation. +

+

If this has been done, two testing routes are available. +The first is to move to the home directory of the R installation +(as given by R.home()) and run +

+
+
cd tests
+## followed by one of
+../bin/R CMD make check
+../bin/R CMD make check-devel
+../bin/R CMD make check-all
+
+ +

and other useful targets are test-BasePackages and test-Recommended to run the tests of the standard and recommended packages (if installed) respectively.

+

This re-runs all the tests relevant to the installed R (including for +example code in the package vignettes), but not for example the ones +checking the example code in the manuals nor making the standalone Rmath +library. This can occasionally be useful when the operating environment +has been changed, for example by OS updates or by substituting the +BLAS (see Shared BLAS). +

+

Parallel checking of packages may be possible: set the environment +variable TEST_MC_CORES to the maximum number of processes to be +run in parallel. This affects both checking the package examples (part +of make check) and package sources (part of make +check-devel and make check-recommended). It does require a +make command which supports the make -j n +option: most do but on Solaris you need to select GNU make or +dmake. Where parallel checking of package sources is done, a log +file pngname.log is left in the tests directory for +inspection. +
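For example, to allow up to eight processes when checking (the count is illustrative):

TEST_MC_CORES=8 make check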

+ +

Alternatively, the installed R can be run, preferably with +--vanilla. Then + +

+
+
Sys.setenv(LC_COLLATE = "C", LC_TIME = "C", LANGUAGE = "en")
+library("tools")
+testInstalledBasic("both")
+testInstalledPackages(scope = "base")
+testInstalledPackages(scope = "recommended")
+
+ +

runs the basic tests and then all the tests on the standard and +recommended packages. These tests can be run from anywhere: the basic +tests write their results in the tests folder of the R home +directory and run fewer tests than the first approach: in particular +they do not test things which need Internet access—that can be tested +by +

+
testInstalledBasic("internet")
+
+ +

(On Windows that runs the tests using whichever of internal or WinInet +internet functions has been selected for that session: to test both run +this twice selecting both options using setInternet2.) +

+

These tests work best if diff (in Rtools*.exe for +Windows users) is in the path. +

+

It is possible to test the installed packages (but not their +package-specific tests) by testInstalledPackages even if +make install-tests was not run. +

+

Note that the results may depend on the language set for times and +messages: for maximal similarity to reference results you may want to +try setting (before starting the R session) +

+
+
LANGUAGE=en
+
+ +

and use a UTF-8 or Latin-1 locale. +

+ +
+ + + +

3 Installing R under Windows

+ + +

The bin/windows directory of a CRAN site contains +binaries for a base distribution and a large number of add-on packages +from CRAN to run on 32- or 64-bit Windows (XP or later) on +‘ix86’ and ‘x86_64CPUs. +

+

Your file system must allow long file names (as is likely except +perhaps for some network-mounted systems). +

+

Installation is via the installer +R-3.2.3-win.exe. Just double-click on the icon and +follow the instructions. When installing on a 64-bit version of Windows +the options will include 32- or 64-bit versions of R (and the default is +to install both). You can uninstall R from the Control Panel. +

+

Note that you will be asked to choose a language for installation, and +that choice applies to both installation and un-installation but not to +running R itself. +

+

See the R +Windows FAQ for more details on the binary installer. +

+ + + + + +
+ + + +

3.1 Building from source

+ +

R can be built as either a 32-bit or 64-bit application on Windows: +to build the 64-bit application you need a 64-bit edition of Windows: +such an OS can also be used to build 32-bit R. +

+

The standard installer combines 32-bit and 64-bit builds into a single +executable which can then be installed into the same location and share +all the files except the .exe and .dll files and some +configuration files in the etc directory. +

+

Building is only tested in a 8-bit locale: using a multi-byte locale (as +used for CJK languages) is unsupported and may not work (the scripts do +try to select a ‘C’ locale; Windows may not honour this). +

+

NB: The build process is currently being changed to require +external binary distributions of third-party software. Their location +is set using macro EXT_LIBS with default setting +$(LOCAL_SOFT); the $(LOCAL_SOFT) macro defaults to +$(R_HOME)/extsoft. This directory can be populated using +make rsync-extsoft. The location can be overridden by +setting EXT_LIBS to a different path in +src/gnuwin32/MkRules.local. A suitable collection of files can +also be obtained from +https://CRAN.R-project.org/bin/windows/extsoft or +https://www.stats.ox.ac.uk/pub/Rtools/libs.html. +

+ + + + + + + + + + + + + + +
+ + + +

3.1.1 Getting the tools

+ +

If you want to build R from the sources, you will first need to +collect, install and test an extensive set of tools. See The Windows toolset (and perhaps updates in +https://CRAN.R-project.org/bin/windows/Rtools/) for details. +

+

The Rtools*.exe executable installer described in The Windows toolset also includes some source files in addition to the R +source as noted below. You should run it first, to obtain a working +tar and other necessities. Choose a “Full installation”, and +install the extra files into your intended R source directory, e.g. +C:/R. The directory name should not contain spaces. We +will call this directory R_HOME below. +

+
+ + + +

3.1.2 Getting the source files

+ +

You need to collect the following sets of files: +

    +
  • Get the R source code tarball R-3.2.3.tar.gz from +CRAN. Open a command window (or another shell) at directory +R_HOME, and run + +
    +
    tar -xf R-3.2.3.tar.gz
    +
    + +

    to create the source tree in R_HOME. Beware: do use +tar to extract the sources rather than tools such as WinZip. +If you are using an account with administrative privileges you may get a +lot of messages which can be suppressed by +

    +
    +
    tar --no-same-owner -xf R-3.2.3.tar.gz
    +
    + + +

    or perhaps better, set the environment variable TAR_OPTIONS to the +value ‘--no-same-owner --no-same-permissions’. +

    +

    It is also possible to obtain the source code using Subversion; see +Obtaining R for details. +

    + +
  • If you are not using a tarball you need to obtain copies of the +recommended packages from CRAN. Put the .tar.gz files +in R_HOME/src/library/Recommended and run make +link-recommended. If you have an Internet connection, you can do this +automatically by running in R_HOME/src/gnuwin32 + +
    +
    make rsync-recommended
    +
    + +
  • The binary distributions of external software. Download + +
    +
    https://www.stats.ox.ac.uk/pub/Rtools/goodies/multilib/local320.zip
    +
    + +

    create an empty directory, say c:/R/extsoft, and unpack it in +that directory by e.g. +

    +
    +
    unzip local320.zip -d c:/R/extsoft
    +
    + +
  • Make a local copy of the configuration rules by + +
    +
    cd R_HOME/src/gnuwin32
    +cp MkRules.dist MkRules.local
    +
    + +

    and edit MkRules.local, uncommenting EXT_LIBS and setting +it to the appropriate path (in our example c:/R/extsoft). +

    +

    Look through the file MkRules.local and make any other changes +needed: in particular, this is where a 64-bit build is selected and the +locations are set of external software for ICU collation and the +cairo-based devices. +

    +
+ +

The following additional item is normally installed by +Rtools31.exe. If instead you choose to do a completely manual +build you will also need +

+
    +
  • The Tcl/Tk support files are contained in Rtools31.exe and +available as .zip files from +https://www.stats.ox.ac.uk/pub/Rtools. Please make sure you +install the right version: there is a 32-bit version and a 64-bit +version. They should be installed to R_HOME, creating +directory Tcl there. + +
+ +
+ + + +

3.1.3 Building the core files

+ + +

Set the environment variable TMPDIR to the absolute path to a +writable directory, with a path specified with forward slashes and no +spaces. (The default is /tmp, which may not be useful on +Windows.) +

+

You may need to compile under a case-honouring file system: we found +that a samba-mounted file system (which maps all file names to +lower case) did not work. +

+

Open a command window at R_HOME/src/gnuwin32, then run +

+
+
make all recommended vignettes
+
+ +

and sit back and wait while the basic compile takes place. +

+

Notes: +

    +
  • We have had reports of earlier versions of anti-virus software locking up the machine, but not for several years. However, aggressive anti-virus checking such as the on-access scanning of Sophos can slow the build down several-fold.
  • You can run a parallel make by e.g. + +
    +
    make -j4 all
    +make -j4 recommended
    +make vignettes
    +
    + +

    but this is only likely to be worthwhile on a multi-core machine with +ample memory, and is not 100% reliable. +

    +
  • It is possible (mainly for those working on R itself) to set the +(make or environment) variable R_NO_BASE_COMPILE to a non-empty +value, which inhibits the byte-compilation of the base and recommended +packages. + +
+ +
+ + + +

3.1.4 Building the cairo devices

+ + +

The devices based on cairographics (svg, cairo_pdf, +cairo_ps and the type = "cairo" versions of png, +jpeg, tiff and bmp) are implemented in a separate +DLL winCairo.dll which is loaded when one of these devices is +first used. It is not built by default, and needs to be built (after +make all) by make cairodevices. +

+

To enable the building of these devices you need to install the static +cairographics libraries built by Simon Urbanek at +https://www.rforge.net/Cairo/files/cairo-current-win.tar.gz. Set +the macro ‘CAIRO_HOME’ in MkRules.local. (Note that this +tarball unpacks with a top-level directory src/: +‘CAIRO_HOME’ needs to include that directory in its path.) +

+
+ + + +

3.1.5 Using ICU for collation

+ +

It is recommended to build R to support ICU (International Components +for Unicode, http://site.icu-project.org/) for collation, as is +commonly done on Unix-alikes. +

+

Two settings are needed in MkRules.local, +

+
# set to use ICU
+# USE_ICU = YES
+# path to parent of ICU headers
+ICU_PATH = /path/to/ICU
+
+ +

The first should be uncommented and the second set to the top-level +directory of a suitably packaged binary build of ICU, for example that +at https://www.stats.ox.ac.uk/pub/Rtools/goodies/ICU_531.zip. +Depending on the build, it may be necessary to edit the macro +ICU_LIBS. +

+

Unlike on a Unix-alike, it is normally necessary to call +icuSetCollate to set a locale before ICU is actually used for +collation, or set the environment variable R_ICU_LOCALE. +
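For example (the locale name is only illustrative):

## either set the environment variable before starting R:  R_ICU_LOCALE=en_US
## or call, early in the session before any collation is done:
icuSetCollate(locale = "en_US")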

+
+ + + +

3.1.6 Support for libcurl

+ +

libcurl version 7.28.0 or later can be used to support +curlGetHeaders and the "libcurl" methods of +download.file and url. +

+

A suitable distribution can be found via +https://www.stats.ox.ac.uk/pub/Rtools/libs.html and its unpacked +location should be specified in file MkRules.local. +

+

For secure use of e.g. ‘https://’ URLs Windows users may need to +specify the path to up-to-date CA root certificates: see +?download.file. +

+
+ + + +

3.1.7 Checking the build

+ +

You can test a build by running +

+
+
make check
+
+ +

The recommended packages can be checked by +

+
+
make check-recommended
+
+ +

Other levels of checking are +

+
+
make check-devel
+
+ +

for a more thorough check of the R functionality, and +

+
+
make check-all
+
+ +

for both check-devel and check-recommended. +

+

If a test fails, there will almost always be a .Rout.fail file in +the directory being checked (often tests/Examples or +tests): examine the file to help pinpoint the problem. +

+

Parallel checking of package sources (part of make check-devel and make check-recommended) is possible: set the environment variable TEST_MC_CORES to the maximum number of processes to be run in parallel.

+ +
+ + + +

3.1.8 Building the manuals

+ +

The PDF manuals require texinfo 5.1 or later, and can be made by +

+
+
make manuals
+
+ +

If you want to make the info versions (not including the Reference +Manual), use +

+
+
cd ../../doc/manual
+make -f Makefile.win info
+
+ +

(all assuming you have pdftex/pdflatex installed and +in your path). +

+

See the Making the manuals section in the Unix-alike section for setting +options such as the paper size and the fonts used. +

+

By default it is assumed that texinfo is not installed, and the manuals will not be built. The comments in file MkRules.dist describe settings to build them. (Copy that file to MkRules.local and edit it.) The texinfo 5.x package for use on Windows is available at https://www.stats.ox.ac.uk/pub/Rtools/: you will also need to install Perl.

+
+ + + +

3.1.9 Building the Inno Setup installer

+ +

You need to have the files for a complete R build, including bitmap and +Tcl/Tk support and the manuals (which requires texinfo installed), +as well as the recommended packages and Inno Setup (see The Inno Setup installer). +

+

Once everything is set up +

+
+
make distribution
+make check-all
+
+ +

will make all the pieces and the installer and put them in the +gnuwin32/cran subdirectory, then check the build. This works by +building all the parts in the sequence: +

+
+
rbuild (the executables, the FAQ docs etc.)
+rpackages (the base packages)
+htmldocs (the HTML documentation)
+cairodevices (the cairo-based graphics devices)
+recommended (the recommended packages)
+vignettes (the vignettes in base packages:
+	   only needed if building from an svn checkout)
+manuals (the PDF manuals)
+rinstaller (the install program)
+crandir (the CRAN distribution directory, only for 64-bit builds)
+
+ +

The parts can be made individually if a full build is not needed, but +earlier parts must be built before later ones. (The Makefile +doesn’t enforce this dependency—some build targets force a lot of +computation even if all files are up to date.) The first four targets +are the default build if just make (or make all) is +run. +

+

Parallel make is not supported and likely to fail. +

+

If you want to customize the installation by adding extra packages, +replace make rinstaller by something like +

+
+
make rinstaller EXTRA_PKGS='pkg1 pkg2 pkg3'
+
+ + +

An alternative way to customize the installer starting with a binary +distribution is to first make an installation of R from the standard +installer, then add packages and make other customizations to that +installation. Then (after having customized file MkRules, +possibly via MkRules.local, and having made R in the +source tree) in src/gnuwin32/installer run +

+
+
make myR IMAGEDIR=rootdir
+
+ +

where rootdir is the path to the root of the customized +installation (in double quotes if it contains spaces or backslashes). +

+

Both methods create an executable with a standard name such as +R-3.2.3-win.exe, so please rename it to indicate that +it is customized. If you intend to distribute a customized +installer please do check that license requirements are met – note that +the installer will state that the contents are distributed under GPL +and this has a requirement for you to supply the complete sources +(including the R sources even if you started with a binary distribution +of R, and also the sources of any extra packages (including their +external software) which are included). +

+ +

The defaults for the startup parameters may also be customized. For example +

+
+
make myR IMAGEDIR=rootdir MDISDI=1
+
+ +

will create an installer that defaults to installing R to run in SDI +mode. See src/gnuwin32/installer/Makefile for the names and +values that can be set. +

+

The standard CRAN distribution of a 32/64-bit installer is +made by first building 32-bit R (just +

+
+
make 32-bit
+
+ +

is needed), and then (in a separate directory) building 64-bit R with +the macro HOME32 set in file MkRules.local to the +top-level directory of the 32-bit build. Then the make +rinstaller step copies the files that differ between architectures from +the 32-bit build as it builds the installer image. +

+
+ + + +

3.1.10 Building the MSI installer

+ +

It is also possible to build an installer for use with Microsoft +Installer. This is intended for use by sysadmins doing automated +installs, and is not recommended for casual use. +

+

It makes use of the Windows Installer XML (WiX) toolkit version +3.5 (or perhaps later, untested) available from +http://wixtoolset.org/. Once WiX is installed, set the path to +its home directory in MkRules.local. +

+

You need to have the files for a complete R build, including bitmap and +Tcl/Tk support and the manuals, as well as the recommended packages. +There is no option in the installer to customize startup options, so +edit etc/Rconsole and etc/Rprofile.site to set these as +required. Then +

+
+
cd installer
+make msi
+
+ +

which will result in a file with a name like +R-3.2.3-win32.msi. This can be double-clicked to be +installed, but those who need it will know what to do with it (usually +by running msiexec /i with additional options). Properties +that users might want to set from the msiexec command line +include ‘ALLUSERS’, ‘INSTALLDIR’ (something like +c:\Program Files\R\R-3.2.3) and ‘RMENU’ (the path +to the ‘R’ folder on the start menu) and ‘STARTDIR’ (the +starting directory for R shortcuts, defaulting to something like +c:\Users\name\Documents\R). +
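For instance, a sysadmin might run something along these lines (the property values are only examples):

msiexec /i R-3.2.3-win32.msi ALLUSERS=1 INSTALLDIR="c:\Program Files\R\R-3.2.3"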

+

The MSI installer can be built both from a 32-bit build of R +(R-3.2.3-win32.msi) and from a 64-bit build of R +(R-3.2.3-win64.msi, optionally including 32-bit files +by setting the macro HOME32, when the name is +R-3.2.3-win.msi). Unlike the main installer, a 64-bit +MSI installer can only be run on 64-bit Windows. +

+

Thanks to David del Campo (Dept of Statistics, University of Oxford) +for suggesting WiX and building a prototype installer. +

+
+ + + +

3.1.11 64-bit Windows builds

+ +

To build a 64-bit version of R you need a 64-bit toolchain: the only one discussed here is based on the work of the MinGW-w64 project (http://sourceforge.net/projects/mingw-w64/), but commercial compilers such as those from Intel and PGI could be used (and have been by R redistributors).

+

Support for MinGW-w64 was developed in the R sources over the period +2008–10 and was first released as part of R 2.11.0. The assistance +of Yu Gong at a crucial step in porting R to MinGW-w64 is gratefully +acknowledged, as well as help from Kai Tietz, the lead developer of the +MinGW-w64 project. +

+

Windows 64-bit is now completely integrated into the R and package +build systems: a 64-bit build is selected in file MkRules.local. +

+
+ + + +

3.2 Testing an Installation

+ +

The Windows installer contains a set of test files used when building +R. +

+

The Rtools are not needed to run these tests, but more comprehensive analysis of errors will be given if diff is in the path (and errorsAreFatal = FALSE is then not needed below).

+

Launch either Rgui or Rterm, preferably with +--vanilla. Then run +

+
+
Sys.setenv(LC_COLLATE = "C", LANGUAGE = "en")
+library("tools")
+testInstalledBasic("both")
+testInstalledPackages(scope = "base", errorsAreFatal = FALSE)
+testInstalledPackages(scope = "recommended", errorsAreFatal = FALSE)
+
+ +

runs the basic tests and then all the tests on the standard and +recommended packages. These tests can be run from anywhere: they write +some of their results in the tests folder of the R home +directory (as given by R.home()), and hence may need to be run +under the account used to install R. +

+

The results of example(md5sums) when testing tools will +differ from the reference output as some files are installed with +Windows’ CRLF line endings. +

+ +
+ +
+


+
+ +

4 Installing R under OS X

+ + + +

The front page of a CRAN site has a link ‘Download R for OS +X’. Click on that, then download the file R-3.2.3.pkg +and install it. This runs on OS X 10.9 and later (Mavericks, Yosemite, +El Capitan13, …). +

+

There may be14 a +separate installer package R-3.2.3-snowleopard.pkg, +which runs on OS X 10.6 and later (Snow Leopard, Lion, Mountain Lion, +Mavericks, Yosemite, …); it is a 64-bit (‘x86_64’) build +which should run on all Macs from mid-2008 on. +

+

Installers for R-patched and R-devel are usually available from +https://r.research.att.com, including a +R-3-2-branch-snowleopard-signed.pkg build for R-patched. +

+

For some older versions of the OS you can in principle (it is little +tested) install R from the sources. +

+

It is important that if you use a binary installer package that your OS +is fully updated: look at ‘Updates’ from the ‘App Store’ to be sure. +(If using XQuartz, check that is current.) +

+

To install, just double-click on the icon of the file you downloaded. +At the ‘Installation Type’ stage, note the option to ‘Customize’. This +currently shows four components: everyone will need the ‘R Framework’ +component: the remaining components are optional. (The ‘Tcl/Tk’ component +is needed to use package tcltk. The ‘Texinfo’ component is only +needed by those installing source packages.) +

+

This is an Apple Installer package. If you encounter any problem during +the installation, please check the Installer log by clicking on the +“Window” menu and item “Installer Log”. The full output (select +“Show All Log”) is useful for tracking down problems. Note that the +installer is clever enough to try to upgrade the last-installed version +of the application where you installed it (which may not be where you +want this time …). +

+

Various parts of the build require XQuartz to be installed: see +https://xquartz.macosforge.org/. These include the tcltk +package and the X11 device: attempting to use these without +XQuartz will remind you. +

+

If you update your OS X version, you should re-install R (and perhaps +XQuartz): the installer tailors the installation to the current version +of the OS. +

+

For building R from source, see OS X. +

+ + + + + + + +
+ + + +

4.1 Running R under OS X

+ +

There are two ways to run R on OS X from a CRAN binary +distribution. +

+

There is a GUI console normally installed with the R icon in +/Applications which you can run by double-clicking (e.g. from +Launchpad or Finder). (If you cannot find it there it was possibly +installed elsewhere so try searching for it in Spotlight.) This is +usually referred to as R.APP to distinguish it from command-line R: +its user manual is currently part of the OS X FAQ at +https://cran.r-project.org/bin/macosx/RMacOSX-FAQ.html and +can be viewed from R.APP’s ‘Help’ menu. +

+ +

You can run command-line R from a Terminal15 so these +can be typed as commands like any other Unix-alike: see the next chapter +of this manual. There are some small differences which may surprise +users of R on other platforms, notably the default location of the +personal library directory (under ~/Library/R, +e.g. ~/Library/R/3.3/library), and that warnings, messages and +other output to stderr are highlighted in bold. +

+

It has been reported that running R.APP under Yosemite may fail if no +preferences are stored, so if it fails when launched for the very first +time, try it again (the first attempt will store some preferences). +

+

Users of R.APP under Mavericks and later need to be aware of the ‘App +Nap’ feature +(https://developer.apple.com/library/mac/releasenotes/MacOSX/WhatsNewInOSX/Articles/MacOSX10_9.html) +which can cause R tasks to appear to run very slowly when not +producing output in the console. Here are three ways to avoid it: +

+
    +
  • Ensure that the console is completely visible (or at least the activity +indicator at the top right corner is visible). + +
  • Call ‘Get Info’ on the application (e.g. from Finder). This may +have two tick boxes in the ‘General’ panel: click the one named ‘Prevent +App Nap’ if it is not already ticked. (This is only available for builds +made prior to Mavericks.) +
  • In a Terminal, run +
    +
    defaults write org.R-project.R NSAppSleepDisabled -bool YES
    +
    + +

    (see https://developer.apple.com/library/mac/releasenotes/MacOSX/WhatsNewInOSX/Articles/MacOSX10_9.html). +

+ +

Using the X11 device or the X11-based versions of View() +and edit() for data frames and matrices (the latter are the +default for command-line R but not R.APP) requires an X sub-system +to be installed: see OS X. (As do the tcltk package and +some third-party packages.) +

+
+ + + +

4.2 Uninstalling under OS X

+ +

R for OS X consists of two parts: the GUI (R.APP) and the R +framework. The un-installation is as simple as removing those folders +(e.g. by dragging them into the Trash). The typical installation will +install the GUI into the /Applications/R.app folder and the R +framework into the /Library/Frameworks/R.framework folder. The +links to R and Rscript in /usr/bin or +/usr/local/bin should also be removed. +

+

If you want to get rid of R more completely using a Terminal, simply +run (use /usr/local/bin on El Capitan): +

+
+
sudo rm -rf /Library/Frameworks/R.framework /Applications/R.app \
+   /usr/bin/R /usr/bin/Rscript
+
+ +

The installation consists of four Apple packages: +org.r-project.R.x86_64.fw.pkg, +org.r-project.R.x86_64.GUI.pkg, +org.r-project.x86_64.tcltk.x11 and +org.r-project.x86_64.texinfo (not all of which need be +installed). You can use pkgutil --forget if you want the Apple +Installer to forget about the package without deleting its files (useful +for the R framework when installing multiple R versions in parallel), +or after you have deleted the files. +

+

Uninstalling the Tcl/Tk or Texinfo components (which are installed under +/usr/local) is not as simple. You can list the files they installed +in a Terminal by +

+
+
pkgutil --files org.r-project.x86_64.tcltk.x11
+pkgutil --files org.r-project.x86_64.texinfo
+
+ +

These are paths relative to /, the root of the file system. +

+
+ + + +

4.3 Multiple versions

+ +

The installer will remove any previous version of the R framework +which it finds installed. This can be avoided by using pkgutil +--forget (see the previous section). However, note that different +versions are installed under +/Library/Frameworks/R.framework/Versions as 3.2, +3.3 and so on, so it is not possible to have different +‘3.x.y’ versions installed for the same ‘x’. +

+

A version of R can be run directly from the command-line as e.g. +

+
/Library/Frameworks/R.framework/Versions/3.2/Resources/bin/R
+
+ +

However, R.APP will always run the ‘current’ version, that is the last +installed version. A small utility, Rswitch.app (available at +https://r.research.att.com/#other), can be used to change the +‘current’ version. This is of limited use as R.APP is compiled +against a particular version of R and will likely crash if switched +to an earlier version. This may allow you to install a development +version of R (de-selecting R.APP) and then switch back to the +release version. +

+ +
+ +
+


+
+ +

5 Running R

+ +

How to start R and what command-line options are available is discussed +in Invoking R in An Introduction to R. +

+

You should ensure that the shell has set adequate resource limits: R +expects a stack size of at least 8MB and to be able to open at least 256 +file descriptors. (Any modern OS should have default limits at least as +large as these, but apparently NetBSD may not. Use the shell command +ulimit (sh/bash) or limit +(csh/tcsh) to check.) +

+
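From a running R session the detected stack limit, and (on a Unix-alike) the open-file limit, can be checked as follows; this is only a rough sketch and the reported values are system-dependent:

Cstack_info()["size"]                # C stack size limit known to R, in bytes
system("ulimit -n", intern = TRUE)   # maximum number of open file descriptors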

R makes use of a number of environment variables, the default values +of many of which are set in file R_HOME/etc/Renviron (there +are none set by default on Windows and hence no such file). These are +set at configure time, and you would not normally want to + +change them – a possible exception is R_PAPERSIZE (see Setting paper size). The paper size will be deduced from the ‘LC_PAPER’ +locale category if it exists and R_PAPERSIZE is unset, and this +will normally produce the right choice from ‘a4’ and ‘letter’ +on modern Unix-alikes (but can always be overridden by setting +R_PAPERSIZE). +

+

Various environment variables can be set to determine where R creates +its per-session temporary directory. The environment variables + + + +TMPDIR, TMP and TEMP are searched in turn and the +first one which is set and points to a writable area is used. If none +do, the final default is /tmp on Unix-alikes and the value of + +R_USER on Windows. The path should be an absolute path not +containing spaces (and it is best to avoid non-alphanumeric characters +such as +). +

+
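To see which (if any) of these variables are set and which directory the current session actually chose (note that setting TMPDIR from within R does not move the temporary directory of an already-running session):

Sys.getenv(c("TMPDIR", "TMP", "TEMP"))   # candidate locations, searched in turn
tempdir()                                # the per-session temporary directory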

Some Unix-alike systems are set up to remove files and directories +periodically from /tmp, for example by a cron job + +running tmpwatch. Set TMPDIR to another directory +before starting long-running jobs on such a system. +

+

Note that TMPDIR will be used to execute configure +scripts when installing packages, so if /tmp has been mounted as +‘noexec’, TMPDIR needs to be set to a directory from which +execution is allowed. +

+
+ +
+


+
+ +

6 Add-on packages

+ + + + + + + + + + + + + +

It is helpful to use the correct terminology. A package is +loaded from a library by the function library(). Thus a +library is a directory containing installed packages; the main library +is R_HOME/library, but others can be used, for example by + +setting the environment variable R_LIBS or using the R function +.libPaths(). +

+
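For example, to see the libraries and installed packages known to the current session:

.libPaths()        # all libraries in use, main library last
.Library           # the main library, R_HOME/library
rownames(installed.packages(lib.loc = .Library))   # packages installed there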
+ + + +

6.1 Default packages

+ + +

The set of packages loaded on startup is by default +

+
+
> getOption("defaultPackages")
+[1] "datasets"  "utils"     "grDevices" "graphics"  "stats"     "methods"
+
+ +

(plus, of course, base) and this can be changed by setting the +option in startup code (e.g. in ~/.Rprofile). It is initially + +set to the value of the environment variable R_DEFAULT_PACKAGES if +set (as a comma-separated list). Setting R_DEFAULT_PACKAGES=NULL +ensures that only package base is loaded. +

+

Changing the set of default packages is normally used to reduce the set +for speed when scripting: in particular not using methods will +reduce the start-up time by a factor of up to two (and this is done by +Rscript). But it can also be used to customize R, e.g. +for class use. +

+ +
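For example (a minimal sketch: the choice of packages and the script name myscript.R are purely illustrative), the default set can be reduced in ~/.Rprofile or, for a single run, via the environment variable:

## in ~/.Rprofile: attach only stats (base is always loaded)
options(defaultPackages = c("stats"))

## for one script run, set the variable before R starts, e.g. (shell command)
## R_DEFAULT_PACKAGES="stats" Rscript myscript.R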
+ + + +

6.2 Managing libraries

+ + +

R packages are installed into libraries, which are +directories in the file system containing a subdirectory for each +package installed there. +

+

R comes with a single library, R_HOME/library which is +the value of the R object ‘.Library’ containing the standard and +recommended16 packages. +Both sites and users can create others and make use of them (or not) in +an R session. At the lowest level ‘.libPaths()’ can be used to +add paths to the collection of libraries or to report the current +collection. +

+ + +

R will automatically make use of a site-specific library +R_HOME/site-library if this exists (it does not in a +vanilla R installation). This location can be overridden by +setting17.Library.site’ in +R_HOME/etc/Rprofile.site, or (not recommended) by setting +the + +environment variable R_LIBS_SITE. Like ‘.Library’, the +site libraries are always included by ‘.libPaths()’. +

+ + + +

Users can have one or more libraries, normally specified by the +environment variable R_LIBS_USER. This has a default value (to +see it, use ‘Sys.getenv("R_LIBS_USER")’ within an R session), +but that is only used if the corresponding directory actually exists +(which by default it will not). +

+

Both R_LIBS_USER and R_LIBS_SITE can specify multiple +library paths, separated by colons (semicolons on Windows). +

+ +
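For example, to create the default personal library by hand and use it for the current session (a sketch only: install.packages will normally offer to create it for you):

lib <- Sys.getenv("R_LIBS_USER")           # the default personal library
if (!dir.exists(lib)) dir.create(lib, recursive = TRUE)
.libPaths(lib)                             # prepend it for this session
.libPaths()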
+ + + +

6.3 Installing packages

+ + + + + + + + + + + +

Packages may be distributed in source form or compiled binary form. +Installing source packages which contain C/C++/Fortran code requires +that compilers and related tools be installed. Binary packages are +platform-specific and generally need no special tools to install, but +see the documentation for your platform for details. +

+

Note that you may need to specify implicitly or explicitly the library to +which the package is to be installed. This is only an issue if you have +more than one library, of course. +

+ + +

Ensure that the environment variable TMPDIR is either unset (and +/tmp exists and can be written in and executed from) or is the +absolute path to a valid temporary directory, not containing spaces. +

+

For most users it suffices to call +‘install.packages(pkgname)’ or its GUI equivalent if the +intention is to install a CRAN package and internet access is +available.18 On most systems ‘install.packages()’ +will allow packages to be selected from a list box (typically with +several thousand items). +

+

To install packages from source on a Unix-alike use in a terminal +

+
+
R CMD INSTALL -l /path/to/library pkg1 pkg2 …
+
+ +

The part ‘-l /path/to/library’ can be omitted, in which case the +first library of a normal R session is used (that shown by +.libPaths()[1]). +

+

There are a number of options available: use R CMD INSTALL --help +to see the current list. +

+ +

Alternatively, packages can be downloaded and installed from within +R. First choose your nearest CRAN mirror using +chooseCRANmirror(). Then download and install packages +pkg1 and pkg2 by +

+
+
> install.packages(c("pkg1", "pkg2"))
+
+ +

The essential dependencies of the specified packages will also be fetched. +Unless the library is specified (argument lib) the first library +in the library search path is used: if this is not writable, R will +ask the user (in an interactive session) if the default personal library +should be created, and if allowed to will install the packages there. +

+ +

If you want to fetch a package and all those it depends on (in any way) +that are not already installed, use e.g. +

+
+
> install.packages("Rcmdr", dependencies = TRUE)
+
+ +

install.packages can install a source package from a local +.tar.gz file (or a URL to such a file) by setting argument +repos to NULL: this will be selected automatically if the +name given is a single .tar.gz file. +

+
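For example (the file name is hypothetical):

install.packages("/tmp/mypkg_1.0.tar.gz", repos = NULL, type = "source")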

install.packages can look in several repositories, specified as a +character vector by the argument repos: these can include a +CRAN mirror, Bioconductor, Omegahat, R-forge, rforge.net, +local archives, local files, …. Function +setRepositories() can select amongst those repositories that the +R installation is aware of. +

+

Naive users sometimes forget that as well as installing a package, they +have to use library to make its functionality available. +

+
+ + + +

6.3.1 Windows

+ +

What install.packages does by default is different on Unix-alikes +(except OS X) and Windows. On Unix-alikes it consults the list of +available source packages on CRAN (or other +repository/ies), downloads the latest version of the package sources, +and installs them (via R CMD INSTALL). On Windows it looks (by +default) first at the list of binary versions of packages +available for your version of R and downloads the latest versions (if +any). If no binary version is available or the source version is newer, +it will install the source versions of packages without compiled +C/C++/Fortran code, and offer to do so for those with, if make +is available (and this can be tuned by option +"install.packages.compile.from.source"). +

+

On Windows install.packages can also install a binary package +from a local zip file (or the URL of such a file) by setting +argument repos to NULL. Rgui.exe has a menu +Packages with a GUI interface to install.packages, +update.packages and library. +

+

Windows binary packages for R are distributed as a single binary +containing either or both architectures (32- and 64-bit). +

+

A few of the binary packages need other software to be installed on your +system: see for example +https://CRAN.R-project.org/bin/windows/contrib/3.2/@ReadMe. +Packages using Gtk+ (Cairo, RGtk2, +cairoDevice and those that depend on them) need the bin +directory of a bundled distribution of Gtk2 from +http://ftp.gnome.org/pub/gnome/binaries/win32/gtk+ or +http://ftp.gnome.org/pub/gnome/binaries/win64/gtk+ in +the path: it should work to have both 32- and 64-bit Gtk+ bin +directories in the path on a 64-bit version of R. +

+

R CMD INSTALL works in Windows to install source packages. No +additional tools are needed if the package does not contain compiled +code, and install.packages(type="source") will work for such +packages (and for those with compiled code if the tools (see The Windows toolset) are in the path). We have seen occasional permission +problems after unpacking source packages on some systems: these have +been circumvented by setting the environment variable +R_INSTALL_TAR to ‘tar.exe’. + +

+

If you have only a source package that is known to work with current +R and just want a binary Windows build of it, you could make use of +the building service offered at +http://win-builder.r-project.org/. +

+

For almost all packages R CMD INSTALL will attempt to install +both 32- and 64-bit builds of a package if run from a 32/64-bit install +of R. It will report success if the installation of the architecture +of the running R succeeded, whether or not the other +architecture was successfully installed. The exceptions are packages +with a non-empty configure.win script or which make use of +src/Makefile.win. If configure.win does something +appropriate to both architectures use19 option +--force-biarch: otherwise R CMD INSTALL +--merge-multiarch can be applied to a source tarball to merge separate +32- and 64-bit installs. (This can only be applied to a tarball, and +will only succeed if both installs succeed.) +

+

If you have a package without compiled code and no Windows-specific +help, you can zip up an installation on another OS and install from that +zip file on Windows. However, such a package can be installed from the +sources on Windows without any additional tools. +

+ + +

There is provision to make use of a system-wide library of installed +external software by setting the make variable +LOCAL_SOFT, to give an equivalent of /usr/local on a +Unix-alike. This can be set in src/gnuwin/MkRules.local when +R is built from sources (see the comments in +src/gnuwin/MkRules.dist), or in file20 etc/i386/Makeconf or etc/x64/Makeconf for an +installed version of R. The version used by CRAN can be +installed as described in Building from source. +

+ +
+ + + +

6.3.2 OS X

+ +

On OS X install.packages works as it does on other Unix-alike +systems, but there are additional types starting with mac.binary +(available for the CRAN distribution but not when compiling +from source: mac.binary.mavericks for a ‘Mavericks’ build with +"default" a synonym for the appropriate variant) which can be +passed to install.packages in order to download and install +binary packages from a suitable repository. These OS X binary package +files have the extension ‘.tgz’. The R.APP GUI provides menus +for installation of either binary or source packages, from +CRAN or local files. +

+

On R builds using binary packages, the default is type both: +this looks first at the list of binary packages available for your +version of R and installs the latest versions (if any). If no binary +version is available or the source version is newer, it will install the +source versions of packages without compiled C/C++/Fortran code and offer +to do so for those with, if make is available. +

+

Note that most binary packages including compiled code are tied to a +particular series (e.g. R 3.2.x or 3.1.x) of R. +

+

You should not attempt to mix-and-match binary packages built for the +‘Snow Leopard’ and ‘Mavericks’ CRAN distributions: doing so is +likely to lead to crashes or failures to load. +

+

Installing source packages which do not contain compiled code should +work with no additional tools. For others you will need the +‘Command Line Tools’ for Xcode and compilers which match those +used to build R: see OS X. +

+

Package rJava and those which depend on it need a Java runtime +installed and several packages need X11 installed, including those using +Tk. For Mountain Lion and later see OS X and Java (OS X). +

+

Tcl/Tk extensions BWidget and Tktable are part of the +Tcl/Tk contained in the R installer. These are required by a number +of CRAN and Bioconductor packages. +

+

A few of the binary packages need other software to be installed on your +system. In particular packages using Gtk+ (RGtk2, +cairoDevice and those that depend on them) need the GTK +framework installed from https://r.research.att.com/libs/: the +appropriate version at the time of writing was +https://r.research.att.com/libs/GTK_2.24.17-X11.pkg +

+

The default compilers specified in +/Library/Frameworks/R.framework/Resources/etc/Makeconf depend on +the version of OS X under which R was installed, and are appropriate +to the latest version of the command-line tools for that version +of OS X. The settings can be changed, either by editing that file or in +a file such as ~/.R/Makevars (see the next section). Entries +which may need to be changed include ‘CC’, ‘CXX’, ‘FC’, +‘F77’, ‘FLIBS’ and the corresponding flags, and perhaps +‘CXXCPP’, ‘DYLIB_LD’, ‘MAIN_LD’, ‘SHLIB_CXXLD’, +‘SHLIB_FCLD’ and ‘SHLIB_LD’. +

+

So for example you could select clang for both C and C++ with +extensive checking by having in ~/.R/Makevars +

+
CC=clang
+CXX=clang++
+CFLAGS=-mtune=native -g -O2 -Wall -pedantic -Wconversion
+CXXFLAGS=-mtune=native -g -O2 -Wall -pedantic -Wconversion
+
+ +

and for a version of gfortran-4.2 we needed +

+
FLIBS=-lgfortran
+
+ + +
+ + + +

6.3.3 Customizing package compilation

+ +

The R system and package-specific compilation flags can be overridden or +added to by setting the appropriate Make variables in the personal file +HOME/.R/Makevars-R_PLATFORM (but +HOME/.R/Makevars.win or HOME/.R/Makevars.win64 +on Windows), or if that does not exist, HOME/.R/Makevars, +where ‘R_PLATFORM’ is the platform for which R was built, as +available in the platform component of the R variable +R.version. An alternative personal file can be specified +via the environment variable R_MAKEVARS_USER. +

+

Package developers are encouraged to use this mechanism to enable a +reasonable amount of diagnostic messaging (“warnings”) when compiling, +such as e.g. -Wall -pedantic for tools from GCC, the Gnu +Compiler Collection. +

+

Note that this mechanism can also be used when it is necessary to change +the optimization level for a particular package. For example +

+
+
## for C code
+CFLAGS=-g -O -mtune=native
+## for C++ code
+CXXFLAGS=-g -O -mtune=native
+## for Fortran code
+FFLAGS=-g -O -mtune=native
+## for Fortran 9x code
+FCFLAGS=-g -O -mtune=native
+
+ +

Another use is to override the settings in a binary installation of R. +For example, to use a different Fortran compiler on OS X +

+
+
F77 = /usr/local/gfortran/bin/gfortran
+FC = /usr/local/gfortran/bin/gfortran
+FLIBS = -L/usr/local/gfortran/lib/gcc/x86_64-apple-darwin14/5.2.0 
+  -L/usr/local/gfortran/lib -lgfortran -lquadmath -lm
+
+ +

(line split for legibility here). +

+

There is also provision for a site-wide Makevars.site file under +R_HOME/etc (in a sub-architecture-specific directory if +appropriate). This is read immediately after Makeconf, and an +alternative file can be specified by environment variable +R_MAKEVARS_SITE. +

+

Note that these mechanisms do not work with packages which fail to pass +settings down to sub-makes, perhaps reading etc/Makeconf in +makefiles in subdirectories. Fortunately such packages are unusual. +

+
+ + + +

6.3.4 Multiple sub-architectures

+ +

When installing packages from their sources, there are some extra +considerations on installations which use sub-architectures. These are +commonly used on Windows but can in principle be used on other +platforms. +

+

When a source package is installed by a build of R which supports +multiple sub-architectures, the normal installation process installs the +packages for all sub-architectures. The exceptions are +

+
+
Unix-alikes
+
+

where there is a configure script, or a file src/Makefile. +

+
+
Windows
+
+

where there is a non-empty configure.win script, or a file +src/Makefile.win (with some exceptions where the package is known +to have an architecture-independent configure.win, or if +--force-biarch or field ‘Biarch’ in the DESCRIPTION +file is used to assert so). +

+
+
+ +

In those cases only the current architecture is installed. Further +sub-architectures can be installed by +

+
+
R CMD INSTALL --libs-only pkg
+
+ +

using the path to R or R --arch to select the +additional sub-architecture. There is also R CMD INSTALL +--merge-multiarch to build and merge the two architectures, starting +with a source tarball. +

+
+ + + +

6.3.5 Byte-compilation

+ +

The base and recommended packages are byte-compiled by default. Other +packages can be byte-compiled on installation by using R CMD +INSTALL with option --byte-compile or by +install.packages(type = "source", INSTALL_opts = +"--byte-compile"). +

+

Not all contributed packages work correctly when byte-compiled (for +example because they interfere with the sealing of namespaces). For +most packages (especially those which make extensive use of compiled +code) the speed-up is small. Unless a package is used frequently the +time spent in byte-compilation can outweigh the time saved in execution: +also byte-compilation can add substantially to the installed size of the +package. +

+

Byte-compilation can be controlled on a per-package basis by the +‘ByteCompile’ field in the DESCRIPTION file. +

+
+ + + +

6.4 Updating packages

+ + + +

The command update.packages() is the simplest way to ensure that +all the packages on your system are up to date. It downloads the list +of available packages and their current versions, compares it with those +installed and offers to fetch and install any that have later versions +on the repositories. +

+
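One common non-interactive form (the arguments shown are standard options of update.packages; checkBuilt = TRUE also re-installs packages built under an earlier version of R) is:

update.packages(ask = FALSE, checkBuilt = TRUE)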

An alternative interface to keeping packages up-to-date is provided by +the command packageStatus(), which returns an object with +information on all installed packages and packages available at multiple +repositories. The print and summary methods give an +overview of installed and available packages, the upgrade method +offers to fetch and install the latest versions of outdated packages. +

+

One sometimes-useful additional piece of information that +packageStatus() returns is the status of a package, as +"ok", "upgrade" or "unavailable" (in the currently +selected repositories). For example +

+
+
> inst <- packageStatus()$inst
+> inst[inst$Status != "ok", c("Package", "Version", "Status")]
+                  Package Version      Status
+Biobase           Biobase   2.8.0 unavailable
+RCurl               RCurl   1.4-2     upgrade
+Rgraphviz       Rgraphviz  1.26.0 unavailable
+rgdal               rgdal  0.6-27     upgrade
+
+ + +
+ + + +

6.5 Removing packages

+ + + +

Packages can be removed in a number of ways. From a command prompt they +can be removed by +

+
+
R CMD REMOVE -l /path/to/library pkg1 pkg2 …
+
+ +

From a running R process they can be removed by +

+
+
> remove.packages(c("pkg1", "pkg2"),
+                  lib = file.path("path", "to", "library"))
+
+ +

Finally, in most installations one can just remove the package directory +from the library. +

+ +
+ + + +

6.6 Setting up a package repository

+ + +

Utilities such as install.packages can be pointed at any +CRAN-style repository, and R users may want to set up their +own. The ‘base’ of a repository is a URL such as +http://www.omegahat.org/R/: this must be an URL scheme that +download.packages supports (which also includes ‘ftp://’ and +‘file://’ and on most systems ‘https://’). Under that base +URL there should be directory trees for one or more of the following +types of package distributions: +

+
    +
  • "source": located at src/contrib and containing +.tar.gz files. Other forms of compression can be used, e.g. +.tar.bz2 or .tar.xz files. Complete repositories contain +the sources corresponding to any binary packages, and in any case it is +wise to have a src/contrib area with a possibly empty +PACKAGES file. + +
  • "win.binary": located at bin/windows/contrib/x.y for +R versions x.y.z and containing .zip files for Windows. + +
  • "mac.binary.mavericks": located at +bin/macosx/mavericks/contrib/3.y for the CRAN build for +‘Mavericks’ (and later) for R versions 3.y.z, containing +.tgz files. + +
  • "mac.binary": located at +bin/macosx/contrib/3.y for a CRAN build for +‘Snow Leopard’ (and later) for R versions 3.y.z, containing +.tgz files. +
+ +

Each terminal directory must also contain a PACKAGES file. This +can be a concatenation of the DESCRIPTION files of the packages +separated by blank lines, but only a few of the fields are needed. The +simplest way to set up such a file is to use function +write_PACKAGES in the tools package, and its help explains +which fields are needed. Optionally there can also be a +PACKAGES.gz file, a gzip-compressed version of +PACKAGES—as this will be downloaded in preference to +PACKAGES it should be included for large repositories. (If you +have a mis-configured server that does not report correctly non-existent +files you may need PACKAGES.gz.) +

+
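For example, for the source area of a repository (the path /srv/myrepo is hypothetical):

library(tools)
write_PACKAGES("/srv/myrepo/src/contrib", type = "source")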

To add your repository to the list offered by setRepositories(), +see the help file for that function. +

+

Incomplete repositories are better specified via a +contriburl argument than via being set as a repository. +

+

A repository can contain subdirectories, when the descriptions in the +PACKAGES file of packages in subdirectories must include a line +of the form +

+
+
Path: path/to/subdirectory
+
+ +

—once again write_PACKAGES is the simplest way to set this up. +

+
+ + + +

6.7 Checking installed source packages

+ +

It can be convenient to run R CMD check on an installed +package, particularly on a platform which uses sub-architectures. The +outline of how to do this is, with the source package in directory +pkg (or a tarball filename): +

+
+
R CMD INSTALL -l libdir pkg > pkg.log 2>&1
+R CMD check -l libdir --install=check:pkg.log pkg
+
+ +

Where sub-architectures are in use the R CMD check line can be +repeated with additional architectures by +

+
+
R --arch arch CMD check -l libdir --extra-arch --install=check:pkg.log pkg
+
+ +

where --extra-arch selects only those checks which depend on +the installed code and not those which analyse the sources. (If +multiple sub-architectures fail only because they need different +settings, e.g. environment variables, --no-multiarch may need +to be added to the INSTALL lines.) On Unix-alikes the +architecture to run is selected by --arch: this can also be +used on Windows with R_HOME/bin/R.exe, but it is more usual +to select the path to the Rcmd.exe of the desired +architecture. +

+

So on Windows to install, check and package for distribution a source +package from a tarball which has been tested on another platform one +might use +

+
+
.../bin/i386/Rcmd INSTALL -l libdir tarball --build > pkg.log 2>&1
+.../bin/i386/Rcmd check -l libdir --extra-arch --install=check:pkg.log pkg
+.../bin/x64/Rcmd check -l libdir --extra-arch --install=check:pkg.log pkg
+
+ +

where one might want to run the second and third lines in a different +shell with different settings for environment variables and the path (to +find external software, notably for Gtk+). +

+

R CMD INSTALL can do a i386 install and then add the +x64 DLL from a single command by +

+
+
R CMD INSTALL --merge-multiarch -l libdir tarball
+
+ +

and --build can be added to zip up the installation. +

+
+ + + +

7 Internationalization and Localization

+ + + +

Internationalization refers to the process of enabling support +for many human languages, and localization to adapting to a +specific country and language. +

+ +

Current builds of R support all the character sets that the +underlying OS can handle. These are interpreted according to the + +current locale, a sufficiently complicated topic to merit a +separate section. Note though that R has no built-in support for +right-to-left languages and bidirectional output, relying on the OS +services. For example, how character vectors in UTF-8 containing both +English digits and Hebrew characters are printed is OS-dependent (and +perhaps locale-dependent). +

+

The other aspect of the internationalization is support for the +translation of messages. This is enabled in almost all builds of R. +

+ + + + + +
+ + + +

7.1 Locales

+ + +

A locale is a description of the local environment of the user, +including the preferred language, the encoding of characters, the +currency used and its conventions, and so on. Aspects of the locale are +accessed by the R functions Sys.getlocale and +Sys.localeconv. +

+
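For example, from a running session:

Sys.getlocale()               # all locale categories as a single string
Sys.getlocale("LC_COLLATE")   # one category
Sys.localeconv()              # numeric and monetary conventions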

The system of naming locales is OS-specific. There is quite wide +agreement on schemes, but not on the details of their implementation. A +locale needs to specify +

    +
  • A human language. These are generally specified by a lower-case +two-character abbreviation following ISO 639 (see e.g. +https://en.wikipedia.org/wiki/ISO_639-1). + +
  • A ‘territory’, used mainly to specify the currency. These are generally +specified by an upper-case two-character abbreviation following ISO 3166 +(see e.g. https://en.wikipedia.org/wiki/ISO_3166). + +
  • A charset encoding, which determines both how a byte stream should be +divided into characters, and which characters the subsequences of bytes +represent. Sometimes the combination of language and territory is used +to specify the encoding, for example to distinguish between traditional +and simplified Chinese. + +
  • Optionally, a modifier, for example to indicate that Austria is to be +considered pre- or post-Euro. The modifier is also used to indicate the +script (@latin, @cyrillic for Serbian, @iqtelif) +or language dialect (e.g. @saaho, a dialect of Afar, and +@bokmal and @nynorsk, dialects of Norwegian regarded by +some OSes as separate languages, no and nn). +
+ +

R is principally concerned with the first (for translations) and +third. Note that the charset may be deducible from the language, as +some OSes offer only one charset per language. +

+ + + + + + +
+ +
+


+
+ +

7.1.1 Locales under Unix-alikes

+ +

Modern Linux uses the XPG21 locale specifications which have the form +‘en_GB’, ‘en_GB.UTF-8’, ‘aa_ER.UTF-8@saaho’, +‘de_AT.iso885915@euro’, the components being in the order listed +above. (See man locale and locale -a for more +details.) Similar schemes are used by most Unix-alikes: some (including +some distributions of Linux) use ‘.utf8’ rather than ‘.UTF-8’. +

+

Note that whereas UTF-8 locales are nowadays almost universally used, +locales such as ‘en_GB’ use 8-bit encodings for backwards +compatibility. +

+
+ + + +

7.1.2 Locales under Windows

+ +

Windows also uses locales, but specified in a rather less concise way. +Most users will encounter locales only via drop-down menus, but more +information and lists can be found at +https://msdn.microsoft.com/en-us/library/hzz3tw78(v=vs.80) +(or if Microsoft moves it yet again, search for ‘Windows language +country strings’). +

+

It offers only one encoding per language. +

+

Some care is needed with Windows’ locale names. For example, +chinese is Traditional Chinese and not Simplified Chinese as used +in most of the Chinese-speaking world. +

+
+ +
+


+
+ +

7.1.3 Locales under OS X

+ +

OS X supports locales in its own particular way, but the R GUI tries to +make this easier for users. See +https://developer.apple.com/documentation/MacOSX/Conceptual/BPInternational/ +for how users can set their locales. As with Windows, end users will +generally only see lists of languages/territories. Users of R in a +terminal may need to set the locale to something like ‘en_GB.UTF-8’ +if it defaults to ‘C’ (as it sometimes does when logging in +remotely and for batch jobs: note that whether Terminal sets the +LANG environment variable is an (advanced) preference, but does so +by default). +

+

Internally OS X uses a form similar to Linux: the main difference from +other Unix-alikes is that where a character set is not specified it is +assumed to be UTF-8. +

+ +
+ +
+


+
+ +

7.2 Localization of messages

+ +

The preferred language for messages is by default taken from the locale. +This can be overridden first by the setting of the environment variable + + + + +LANGUAGE and then22 +by the environment variables LC_ALL, LC_MESSAGES and +LANG. (The last three are normally used to set the locale and so +should not be needed, but the first is only used to select the language +for messages.) The code tries hard to map locales to languages, but on +some systems (notably Windows) the locale names needed for the +environment variable LC_ALL do not all correspond to XPG language +names and so LANGUAGE may need to be set. (One example is +‘LC_ALL=es’ on Windows which sets the locale to Estonian and the +language to Spanish.) +

+

It is usually possible to change the language once R is running +via (not Windows) Sys.setlocale("LC_MESSAGES", +"new_locale"), or by setting an environment variable such as +LANGUAGE, provided23 the language you are changing to can be output in +the current character set. But this is OS-specific, and has been known +to stop working on an OS upgrade. +

+
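For example, to ask for French translations from within a running session (a sketch only: whether this takes effect is, as noted above, OS-specific and requires the French catalogues to be installed):

Sys.setenv(LANGUAGE = "fr")   # prefer French message translations, if available
Sys.getenv("LANGUAGE")
## subsequent stop/warning/message output from R and from packages will be
## looked up in the French catalogues where translations exist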

Messages are divided into domains, and translations may be +available for some or all messages in a domain. R makes use of the +following domains. +

+
    +
  • Domain R for the C-level error and warning messages from the R +interpreter. + +
  • Domain R-pkg for the R stop, warning and +message messages in each package, including R-base for the +base package. + +
  • Domain pkg for the C-level messages in each package. + +
  • Domain RGui for the menus etc of the R for Windows GUI front-end. + +
+ +

Dividing up the messages in this way allows R to be extensible: as +packages are loaded, their message translation catalogues can be loaded +too. +

+

R can be built without support for translations, but it is enabled by +default. +

+

R-level and C-level domains are subtly different, for example in the way +strings are canonicalized before being passed for translation. +

+

Translations are looked for by domain according to the currently +specified language, as specifically as possible, so for example an +Austrian (‘de_AT’) translation catalogue will be used in preference +to a generic German one (‘de’) for an Austrian user. However, if a +specific translation catalogue exists but does not contain a +translation, the less specific catalogues are consulted. For example, +R has catalogues for ‘en_GB’ that translate the Americanisms +(e.g., ‘gray’) in the standard messages into English.24 Two other examples: there are catalogues +for ‘es’, which is Spanish as written in Spain and these will by +default also be used in Spanish-speaking Latin American countries, and +also for ‘pt_BR’, which are used for Brazilian locales but not for +locales specifying Portugal. +

+

Translations in the right language but the wrong charset are made use of + +by on-the-fly re-encoding. The LANGUAGE variable (only) can be a +colon-separated list, for example ‘se:de’, giving a set of +languages in decreasing order of preference. One special value is +‘en@quot’, which can be used in a UTF-8 locale to have American +error messages with pairs of single quotes translated to Unicode directional +quotes. +

+

If no suitable translation catalogue is found or a particular message is +not translated in any suitable catalogue, ‘English’25 is used. +

+

See https://developer.r-project.org/Translations30.html for how to +prepare and install translation catalogues. +

+
+ + + +

8 Choosing between 32- and 64-bit builds

+ +

Almost all current CPUs have both 32- and 64-bit sets of +instructions. Most OSes running on such CPUs offer the choice +of building a 32-bit or a 64-bit version of R (and details are given +below under specific OSes). For most a 32-bit version is the default, +but for some (e.g., ‘x86_64’ Linux and OS X >= 10.6) +64-bit is. +

+

All current versions of R use 32-bit integers and +ISO/IEC 6055926 double-precision reals, and so compute to +the same precision27 and with the same limits on the sizes of +numerical quantities. The principal difference is in the size of the +pointers. +

+

64-bit builds have both advantages and disadvantages: +

    +
  • The total virtual memory space made available to a 32-bit process is +limited by the pointer size to 4GB, and on most OSes to 3GB (or even +2GB). The limits for 64-bit processes are much larger (e.g. +8–128TB). + +

    R allocates memory for large objects as needed, and removes any +unused ones at garbage collection. When the sizes of objects become an +appreciable fraction of the address limit, fragmentation of the address +space becomes an issue and there may be no hole available that is the +size requested. This can cause more frequent garbage collection or the +inability to allocate large objects. As a guide, this will become an +issue for 32-bit builds with objects more than 10% of the size of the +address space (around 300Mb) or when the total size of objects in use is +around one third (around 1Gb). +

    +
  • Only 64-bit builds support ‘long vectors’, those with 2^{31} or +more elements (each of which needs 16GB of storage for a numeric +vector). + +
  • Most 32-bit OSes by default limit file sizes to 2GB (and this may also +apply to 32-bit builds on 64-bit OSes). This can often be worked +around: and configure selects suitable defines if this is +possible. (We have also largely worked around that limit on 32-bit +Windows.) 64-bit builds have much larger limits. + +
  • Because the pointers are larger, R’s basic structures are larger. +This means that R objects take more space and (usually) more time to +manipulate. So 64-bit builds of R will, all other things being +equal, run slower than 32-bit builds. (On Sparc Solaris the difference +was 15-20%.) + +
  • However, ‘other things’ may not be equal. In the specific case of +‘x86_64’ vs ‘ix86’, the 64-bit CPU has features +(such as SSE2 instructions) which are guaranteed to be present but are +optional on the 32-bit CPU, and also has more general-purpose registers. +This means that on chips like a desktop Intel Core 2 Duo the vanilla +64-bit version of R has been around 10% faster on both Linux and OS +X. (Laptop CPUs are usually relatively slower in 64-bit mode.) +
+ +

So, for speed you may want to use a 32-bit build (especially on a +laptop), but to handle large datasets (and perhaps large files) a 64-bit +build. You can often build both and install them in the same place: +See Sub-architectures. (This is done for the Windows binary +distributions.) +

+

Even on 64-bit builds of R there are limits on the size of R +objects (see help("Memory-limits")), some of which stem from the +use of 32-bit integers (especially in FORTRAN code). For example, the +dimensions of an array are limited to 2^{31} - 1. +

+
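Some of these limits can be inspected from within R, for example:

.Machine$integer.max   # 2147483647, i.e. 2^31 - 1
2^31 - 1               # the maximum extent of a single array dimension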
+ + + +

9 The standalone Rmath library

+ +

The routines supporting the distribution and +special28 functions in R +and a few others are declared in C header file Rmath.h. These +can be compiled into a standalone library for linking to other +applications. (Note that they are not a separate library when R is +built, and the standalone version differs in several ways.) +

+

The makefiles and other sources needed are in directory +src/nmath/standalone, so the following instructions assume that +is the current working directory (in the build directory tree on a +Unix-alike if that is separate from the sources). +

+

Rmath.h contains ‘R_VERSION_STRING’, which is a character +string containing the current R version, for example "3.2.0". +

+

There is full access to R’s handling of NaN, Inf and +-Inf via special versions of the macros and functions +

+
+
    ISNAN, R_FINITE, R_log, R_pow and R_pow_di
+
+ +

and (extern) constants R_PosInf, R_NegInf and NA_REAL. +

+

There is no support for R’s notion of missing values, in particular +not for NA_INTEGER nor the distinction between NA and +NaN for doubles. +

+

A little care is needed to use the random-number routines. You will +need to supply the uniform random number generator +

+
+
    double unif_rand(void)
+
+ +

or use the one supplied (and with a shared library or DLL you may +have to use the one supplied, which is the Marsaglia-multicarry with +an entry point +

+
+
    set_seed(unsigned int, unsigned int)
+
+ +

to set its seeds). +

+

The facilities to change the normal random number generator are +available through the constant N01_kind. This takes values +from the enumeration type +

+
+
typedef enum {
+    BUGGY_KINDERMAN_RAMAGE,
+    AHRENS_DIETER,
+    BOX_MULLER,
+    USER_NORM,
+    INVERSION,
+    KINDERMAN_RAMAGE
+} N01type;
+
+ +

(and ‘USER_NORM’ is not available). +

+ + + + + + +
+ + + +

9.1 Unix-alikes

+ +

If R has not already been made in the directory tree, +configure must be run as described in the main build +instructions. +

+

Then (in src/nmath/standalone) +

+
+
make
+
+ +

will make standalone libraries libRmath.a and libRmath.so +(libRmath.dylib on OS X): ‘make static’ and ‘make +shared’ will create just one of them. +

+

To use the routines in your own C or C++ programs, include +

+
+
#define MATHLIB_STANDALONE
+#include <Rmath.h>
+
+ +

and link against ‘-lRmath’ (and ‘-lm’ if needed on your OS). +The example file test.c does nothing useful, but is provided to +test the process (via make test). Note that you will probably +not be able to run it unless you add the directory containing + +libRmath.so to the LD_LIBRARY_PATH environment variable +(libRmath.dylib, DYLD_LIBRARY_PATH on OS X). +

+

The targets +

+
+
make install
+make uninstall
+
+ +

will (un)install the header Rmath.h and shared and static + +libraries (if built). Both prefix= and DESTDIR are +supported, together with more precise control as described for the main +build. +

+

make install’ installs a file for pkg-config to use by +e.g. +

+
+
$(CC) `pkg-config --cflags libRmath` -c test.c
+$(CC) `pkg-config --libs libRmath` test.o -o test
+
+ +

On some systems ‘make install-strip’ will install a stripped shared +library. +

+ + +
+ + + +

9.2 Windows

+ +

You need to set up29 almost all the +tools to make R and then run (in a Unix-like shell) +

+
+
(cd ../../gnuwin32; make MkRules)
+(cd ../../include; make -f Makefile.win config.h Rconfig.h Rmath.h)
+make -f Makefile.win
+
+ +

Alternatively, in a cmd.exe shell use +

+
+
cd ../../include
+make -f Makefile.win config.h Rconfig.h Rmath.h
+cd ../nmath/standalone
+make -f Makefile.win
+
+ + +

This creates a static library libRmath.a and a DLL +Rmath.dll. If you want an import library libRmath.dll.a +(you don’t need one), use +

+
+
make -f Makefile.win shared implib
+
+ +

To use the routines in your own C or C++ programs using MinGW, include +

+
+
#define MATHLIB_STANDALONE
+#include <Rmath.h>
+
+ +

and link against ‘-lRmath’. This will use the first found of +libRmath.dll.a, libRmath.a and Rmath.dll in that +order, so the result depends on which files are present. You should be +able to force static or dynamic linking via +

+
+
-Wl,-Bstatic -lRmath -Wl,-Bdynamic
+-Wl,-Bdynamic -lRmath
+
+ +

or by linking to explicit files (as in the ‘test’ target in +Makefile.win: this makes two executables, test.exe which +is dynamically linked, and test-static.exe, which is statically +linked). +

+

It is possible to link to Rmath.dll using other compilers, either +directly or via an import library: if you make a MinGW import library as +above, you will create a file Rmath.def which can be used +(possibly after editing) to create an import library for other systems +such as Visual C++. +

+

If you make use of dynamic linking you should use +

+
+
#define MATHLIB_STANDALONE
+#define RMATH_DLL
+#include <Rmath.h>
+
+ +

to ensure that the constants like NA_REAL are linked correctly. +(Auto-import will probably work with MinGW, but it is better to be +sure. This is likely to also work with VC++, Borland and similar +compilers.) +

+ +
+ + + +

Appendix A Essential and useful other programs under a Unix-alike

+ +

This appendix gives details of programs you will need to build R on +Unix-like platforms, or which will be used by R if found by +configure. +

+

Remember that some package management systems (such as RPM and +Debian/Ubuntu’s) make a distinction between the user version of a +package and the development version. The latter usually has the same +name but with the extension ‘-devel’ or ‘-dev’: you need both +versions installed. +

+ + + + + + +
+ + + +

A.1 Essential programs and libraries

+ +

You need a means of compiling C and FORTRAN 90 (see Using FORTRAN). Your C compiler should be +ISO/IEC 6055930, POSIX 1003.1 and C99-compliant.31 R tries to choose suitable flags for +the C compilers it knows about, but you may have to set CC or +CFLAGS suitably. For many versions of gcc with +glibc this means including +-std=gnu9932. If the compiler is detected as +gcc 4.x, -std=gnu99 will be appended to +CC unless it conflicts with a setting of CFLAGS. (Note +that options essential to run the compiler even for linking, such as +those to set the architecture, should be specified as part of CC +rather than in CFLAGS.) +

+

Unless you do not want to view graphs on-screen (or use OS X) you need +‘X11’ installed, including its headers and client libraries. For +recent Fedora distributions it means (at least) RPMs ‘libX11’, +‘libX11-devel’, ‘libXt’ and ‘libXt-devel’. On Debian we +recommend the meta-package ‘xorg-dev’. If you really do not want +these you will need to explicitly configure R without X11, using +--with-x=no. +

+

The command-line editing (and command completion) depends on the +GNU readline library: version 4.2 or later is needed +for all the features to be enabled. Otherwise you will need to +configure with --with-readline=no (or equivalent). +

+

A suitably comprehensive iconv function is essential. The R +usage requires iconv to be able to translate between +"latin1" and "UTF-8", to recognize "" (as the +current encoding) and "ASCII", and to translate to and from the +Unicode wide-character formats "UCS-[24][BL]E" — this is true +by default for glibc33 but not of most commercial Unixes. However, you +can make use of GNU libiconv (as used on OS X: see +https://www.gnu.org/software/libiconv/). +

+

The OS needs to have enough support34 for wide-character +types: this is checked at configuration. A small number of POSIX +functions35 are essential, and others36 will be used if available. +

+

A tar program is needed to unpack the sources and packages +(including the recommended packages). A version37 that can +automagically detect compressed archives is preferred for use with +untar(): the configure script looks for gtar and +gnutar before + +tar – use environment variable TAR to override this. +

+

There need to be suitable versions of the tools grep and +sed: the problems are usually with old AT&T and BSD variants. +configure will try to find suitable versions (including +looking in /usr/xpg4/bin which is used on some commercial +Unixes). +

+

You will not be able to build most of the manuals unless you have +texi2any version 5.1 or later installed, and if not most of +the HTML manuals will be linked to a version on CRAN. To +make PDF versions of the manuals you will also need file +texinfo.tex installed (which is part of the GNU +texinfo distribution but is often made part of the TeX package +in re-distributions) as well as +texi2dvi.38 +Further, the versions of texi2dvi and texinfo.tex need +to be compatible: we have seen problems with older TeX distributions. +

+ +

The PDF documentation (including doc/NEWS.pdf) and building +vignettes needs pdftex and pdflatex. We require +LaTeX version 2005/12/01 or later (for UTF-8 support). +Building PDF package manuals (including the R reference manual) and +vignettes is sensitive to the version of the LaTeX package +hyperref and we recommend that the TeX distribution used is +kept up-to-date. A number of standard LaTeX packages are required +(including url and some of the font packages such as times, +helvetic, ec and cm-super) and others such as +hyperref and inconsolata are desirable (and without them you +may need to change R’s defaults: see Making the manuals). Note +that package hyperref (currently) requires packages +kvoptions, ltxcmds and refcount. For distributions +based on TeXLive the simplest approach may be to install collections +collection-latex, collection-fontsrecommended, +collection-latexrecommended, collection-fontsextra and +collection-latexextra (assuming they are not installed by +default): Fedora uses names like texlive-collection-fontsextra and +Debian/Ubuntu like texlive-fonts-extra. +

+ + +

If you want to build from the R Subversion repository then +texi2any is highly recommended as it is used to create files +in the tarball but not under Subversion. +

+ +

The essential programs should be in your PATH at the time +configure is run: this will capture the full paths. +

+
+ + + +

A.2 Useful libraries and programs

+ +

The ability to use translated messages makes use of gettext and +most likely needs GNU gettext: you do need this to work +with new translations, but otherwise the version contained in the R +sources will be used if no suitable external gettext is found. +

+

The ‘modern’ version of the X11(), jpeg(), png() +and tiff() graphics devices uses the cairo and +(optionally) Pango libraries. Cairo version 1.2.0 or later is +required. Pango needs to be at least version 1.10, and 1.12 is the +earliest version we have tested. (For Fedora users we believe the +pango-devel RPM and its dependencies suffice.) R checks for +pkg-config, and uses that to check first that the +‘pangocairo’ package is installed (and if not, ‘cairo’) and if +additional flags are needed for the ‘cairo-xlib’ package, then if +suitable code can be compiled. These tests will fail if +pkg-config is not installed39, and are likely to fail if cairo was built +statically (unusual). Most systems with Gtk+ 2.8 or later +installed will have suitable libraries. +

+

For the best font experience with these devices you need suitable fonts +installed: Linux users will want the urw-fonts package. On +platforms which have it available, the msttcorefonts +package40 provides +TrueType versions of Monotype fonts such as Arial and Times New Roman. +Another useful set of fonts is the ‘liberation’ TrueType fonts available +at +https://fedorahosted.org/liberation-fonts/,41 which cover the Latin, Greek and Cyrillic alphabets +plus a fair range of signs. These share metrics with Arial, Times New +Roman and Courier New, and contain fonts rather similar to the first two +(https://en.wikipedia.org/wiki/Liberation_fonts). Then there +is the ‘Free UCS Outline Fonts’ project +(https://www.gnu.org/software/freefont/) which are +OpenType/TrueType fonts based on the URW fonts but with extended Unicode +coverage. See the R help on X11 on selecting such fonts. +

+

The bitmapped graphics devices jpeg(), png() and +tiff() need the appropriate headers and libraries installed: +jpeg (version 6b or later, or libjpeg-turbo) or +libpng (version 1.2.7 or later) and zlib or libtiff +(any recent version – 3.9.[4567] and 4.0.[23] have been tested) +respectively. They also need support for either X11 or +cairo (see above). Should support for these devices not +be required or broken system libraries need to be avoided there are +configure options --without-libpng, +--without-jpeglib and --without-libtiff. For most +system installations the TIFF libraries will require JPEG libraries to +be present and perhaps linked explicitly, so --without-jpeglib +may also disable the tiff() device. The tiff() devices +only require a basic build of libtiff (not even JPEG support is +needed). Recent versions allow several other libraries to be linked +into libtiff such as lzma, jbig and jpeg12, +and these may need also to be present. +

+
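Once R is built and installed, the capabilities() function reports which of these optional facilities were compiled in, for example:

capabilities(c("X11", "cairo", "jpeg", "png", "tiff", "tcltk", "iconv"))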

If you have them installed (including the appropriate headers and of +suitable versions), system versions of zlib (version 1.2.5 or +later), libbz2 (version 1.0.6 or later: called +bzip2-libs/bzip2-devel or libbz2-1.0/libbz2-dev +by some Linux distributions) and PCRE (version 8.10 or later, preferably +8.32 or later42) will be used, +otherwise versions in the R sources will be compiled in. The +external versions can be avoided by configure options +--without-system-zlib, --without-system-bzlib and +--without-system-pcre. +

+

Option --with-system-tre is also available: it needs a recent +version of TRE. (The current sources are in the git repository +at https://github.com/laurikari/tre/, but at the time of writing +the resulting build will not pass its checks.) +

+

Library liblzma from xz-utils version 5.0.3 or later +(including 5.2.x) will be used if installed: the version in the R +sources can be selected instead by configuring with +--without-system-xz. Systems differ in what they call the +package including this: e.g. on Fedora the library is in +‘xz-libs’ and the headers in ‘xz-devel’. +

+

An implementation of XDR is required, and the R sources +contain one which is likely to suffice (although a system version may +have higher performance). XDR is part of RPC and +historically has been part of libc on a Unix-alike. However some +builds of glibc hide it with the intention that the +TI-RPC library be used instead, in which case libtirpc +(and its development version) needs to be installed, and its headers +need to be on the C include path or in /usr/include/tirpc. +
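As an illustration (the package names are typical Fedora ones and other distributions differ; the include path shown is a placeholder), one would install the TI-RPC development files and, only if they end up somewhere other than /usr/include/tirpc, point CPPFLAGS at them:

dnf install libtirpc libtirpc-devel
./configure CPPFLAGS="-I/path/to/tirpc/include"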

+

Use of the X11 clipboard selection requires the Xmu headers and +libraries. These are normally part of an X11 installation (e.g. the +Debian meta-package ‘xorg-dev’), but some distributions have split +this into smaller parts, so for example recent versions of Fedora +require the ‘libXmu’ and ‘libXmu-devel’ RPMs. +

+

Some systems (notably OS X and at least some FreeBSD systems) have +inadequate support for collation in multibyte locales. It is possible +to replace the OS’s collation support by that from ICU (International +Components for Unicode, http://site.icu-project.org/), and this +provides much more precise control over collation on all systems. ICU +is available as sources and as binary distributions for (at least) most +Linux distributions, Solaris, FreeBSD and AIX, usually as libicu +or icu4c. It will be used by default where available: should a +very old or broken version of ICU be found this can be suppressed by +--without-ICU. +

+

If libcurl version 7.28.0 or later is available (including its +development files), it will be linked in to support +curlGetHeaders and the "libcurl" methods of +download.file and url. This is recommended as it gives +access to ‘https://’ and ‘ftps://’ URLs. Information on +libcurl is found from the curl-config script: if that +is missing or needs to be overridden43 +there are macros described in file config.site. +
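Before configuring, it can be worth checking what curl-config reports (a quick sanity check; the exact output varies by libcurl version):

curl-config --version      # needs to report libcurl 7.28.0 or later
curl-config --protocols    # should list HTTPS and FTPS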

+

The bitmap and dev2bitmap devices and function +embedFonts() use ghostscript +(http://www.ghostscript.com/). This should either be in your +path when the command is run, or its full path specified by the +environment variable R_GSCMD at that time. + +
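For example (the path is illustrative), if gs is installed outside your path you might set:

export R_GSCMD=/opt/ghostscript/bin/gs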


A.2.1 Tcl/Tk

+ +

The tcltk package needs Tcl/Tk >= 8.4 installed: the sources are +available at https://www.tcl.tk/. To specify the locations of the +Tcl/Tk files you may need the configuration options +

+
+
--with-tcltk
+

use Tcl/Tk, or specify its library directory +

+
--with-tcl-config=TCL_CONFIG
+

specify location of tclConfig.sh +

+
--with-tk-config=TK_CONFIG
+

specify location of tkConfig.sh +

+
+ +

or use the configure variables TCLTK_LIBS and +TCLTK_CPPFLAGS to specify the flags needed for linking against +the Tcl and Tk libraries and for finding the tcl.h and +tk.h headers, respectively. If you have both 32- and 64-bit +versions of Tcl/Tk installed, specifying the paths to the correct config +files may be necessary to avoid confusion between them. +
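For instance (the paths are examples only and vary by distribution), on a 64-bit Linux system with the config scripts under /usr/lib64 one might use:

./configure --with-tcl-config=/usr/lib64/tclConfig.sh \
            --with-tk-config=/usr/lib64/tkConfig.sh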

+

Versions of Tcl/Tk up to 8.5.18 and 8.6.4 have been tested (including +most versions of 8.4.x, but not recently). +

+

Note that the tk.h header includes44 X11 headers, so you will need X11 and its +development files installed. +

+

A.2.2 Java support

+ +

The build process looks for Java support on the host system, and if it +finds it sets some settings which are useful for Java-using packages +(such as rJava and JavaGD). This check can be +suppressed by configure option --disable-java. + +Configure variable JAVA_HOME can be set to point to a specific +JRE/JDK, on the configure command line or in the environment. +

+

Principal amongst these settings are some library paths to the Java +libraries and JVM, which are stored in environment variable + +R_JAVA_LD_LIBRARY_PATH in file R_HOME/etc/ldpaths (or +a sub-architecture-specific version). A typical setting for +‘x86_64’ Linux is +

+
+
JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.51-4.b16.fc21.x86_64/jre
+R_JAVA_LD_LIBRARY_PATH=${JAVA_HOME}/lib/amd64/server
+
+ +

Note that this unfortunately depends on the exact version of the JRE/JDK +installed, and so may need updating if the Java installation is updated. +This can be done by running R CMD javareconf which updates +settings in both etc/Makeconf and +R_HOME/etc/ldpaths. See R CMD javareconf --help for +details. +
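A typical sequence after a Java update is therefore (the JAVA_HOME value is an example of an invariant link, as discussed below):

export JAVA_HOME=/usr/lib/jvm/java
R CMD javareconf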

+

Another way of overriding those settings is to set the environment variable + +R_JAVA_LD_LIBRARY_PATH (before R is started, hence not in +~/.Renviron), which suffices to run already-installed +Java-using packages. For example +

+
+
R_JAVA_LD_LIBRARY_PATH=/usr/lib/jvm/java-1.8.0/jre/lib/amd64/server
+
+ +

It may be possible to avoid this by specifying an invariant link as the +path. For example, on that system any of +

+
+
JAVA_HOME=/usr/lib/jvm/java
+JAVA_HOME=/usr/lib/jvm/java-1.8.0
+JAVA_HOME=/usr/lib/jvm/java-1.8.0/jre
+
+ +

worked. +

+

A.2.3 Other compiled languages

+ +

Some add-on packages need a C++ compiler. This is specified by the +configure variables CXX, CXXFLAGS and similar. +configure will normally find a suitable compiler. However, in +most cases this will be a C++98 compiler, and as from R 3.1.0 it is +possible to specify an alternative compiler for use with C++11 by the +configure variables CXX1X, CXX1XSTD, CXX1XFLAGS and +similar. Again, configure will normally find a suitable value +for CXX1XSTD if the compiler given by CXX is capable of +compiling C++11 code, but it is possible that a completely different +compiler will be needed. +

+

Other packages need full Fortran 90 (or later) support. For source +files with extension .f90 or .f95, the compiler defined by +the macro FC is used by R CMD INSTALL. This is found +when R is configured and is often the same as F77: note that +it is detected by the name of the command without a test that it can +actually compile Fortran 90 code. Set the configure variable FC +to override this if necessary: variables FCFLAGS, +FCPICFLAGS, FCLIBS, SHLIB_FCLD and +SHLIB_FCLDFLAGS might also need to be set. +
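A minimal config.site sketch for overriding the Fortran 90 compiler (the compiler name and flags are illustrative) is:

FC=gfortran
FCFLAGS="-g -O2"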

+

See file config.site in the R source for more details about +these variables. +

+

A.3 Linear algebra

+ +
+


+
+ +

A.3.1 BLAS

+ +

The linear algebra routines in R can make use of enhanced +BLAS (Basic Linear Algebra Subprograms, +http://www.netlib.org/blas/faq.html) routines. However, +these have to be explicitly requested at configure time: R provides +an internal BLAS which is well-tested and will be adequate for +most uses of R. +

+

You can specify a particular BLAS library via a value +for the configuration option --with-blas and not to use an +external BLAS library by --without-blas (the +default). If --with-blas is given with no =, its value +is taken from the + +environment variable BLAS_LIBS, set for example in +config.site. If neither the option nor the environment variable +supply a value, a search is made for a suitable BLAS. If the +value is not obviously a linker command (starting with a dash or giving +the path to a library), it is prefixed by ‘-l’, so +

+
+
--with-blas="foo"
+
+ +

is an instruction to link against ‘-lfoo’ to find an external +BLAS (which needs to be found both at link time and run time). +

+

The configure code checks that the external BLAS is complete +(it must include all double precision and double complex routines, as +well as LSAME), and appears to be usable. However, an external +BLAS has to be usable from a shared object (so must contain +position-independent code), and that is not checked. +

+

Some enhanced BLASes are compiler-system-specific +(sunperf on Solaris45, libessl on IBM, +Accelerate on OS X). The correct incantation for +these is usually found via --with-blas with no value on +the appropriate platforms. +

+

Some of the external BLASes are multi-threaded. One issue is +that R profiling (which uses the SIGPROF signal) may cause +problems, and you may want to disable profiling if you use a +multi-threaded BLAS. Note that using a multi-threaded +BLAS can result in taking more CPU time and even +more elapsed time (occasionally dramatically so) than using a similar +single-threaded BLAS. On a machine running other tasks, there +can be contention for CPU caches that reduces the effectiveness of the +optimization of cache use by a BLAS implementation. +

+

Note that under Unix (but not under Windows) if R is compiled against +a non-default BLAS and --enable-BLAS-shlib is +not used, then all BLAS-using packages must also be. +So if R is re-built to use an enhanced BLAS then packages +such as quantreg will need to be re-installed. +

+

R relies on ISO/IEC 60559 compliance of an +external BLAS. This can be broken if for example the code +assumes that terms with a zero factor are always zero and do not need to +be computed—whereas x*0 can be NaN. This is checked in +the test suite. +

+

External BLAS implementations often make less use of +extended-precision floating-point registers and will almost certainly +re-order computations. This can result in less accuracy than using the +internal BLAS, and may result in different solutions, e.g. +different signs in SVD and eigendecompositions. +

+

The URIs for several of these BLAS are subject to frequent gratuitous +changes, so you will need to search for their current locations. +

+ +
+


+
+ +

A.3.1.1 ATLAS

+ +

ATLAS (http://math-atlas.sourceforge.net/) is a “tuned” +BLAS that runs on a wide range of Unix-alike platforms. +Unfortunately it is built by default as a static library that on some +platforms cannot be used with shared objects such as are used in R +packages. Be careful when using pre-built versions of ATLAS (they seem +to work on ‘ix86’ platforms, but not always on ‘x86_64’ +ones). +

+

The usual way to specify ATLAS will be via +

+
+
--with-blas="-lf77blas -latlas"
+
+ +

if the libraries are in the library path, otherwise by +

+
+
--with-blas="-L/path/to/ATLAS/libs -lf77blas -latlas"
+
+ +

For example, ‘x86_64’ Fedora needs +

+
--with-blas="-L/usr/lib64/atlas -lf77blas -latlas"
+
+ +

For systems with multiple CPU cores it is possible to use a +multi-threaded version of ATLAS, by specifying +

+
+
--with-blas="-lptf77blas -lpthread -latlas"
+
+ +

Consult its installation guide for how to build ATLAS with +position-independent code, and as a shared library. +

+
+ +
+


+
+ +

A.3.1.2 ACML

+ +

For ‘x86_64’ processors46 under Linux there is the AMD Core Math Library (ACML). +For the gcc version we could use +

+
+
--with-blas="-lacml"
+
+ +

if the appropriate library directory (such as + +/opt/acml5.1.0/gfortran64/lib) is in the LD_LIBRARY_PATH. +For other compilers, see the ACML documentation. There is a +multithreaded Linux version of ACML available for recent versions of +gfortran. To make use of this you will need something like +

+
+
--with-blas="-L/opt/acml5.1.0/gfortran64_mp/lib -lacml_mp"
+
+ +

(and you may need to arrange for the directory to be in ld.so +cache). +

+

See Shared BLAS for an alternative (and in many ways preferable) way to use ACML.

+

The version last tested (5.1.0) failed the reg-BLAS.R test in its +handling of NAs. +

+
+ +
+


+
+ +

A.3.1.3 Goto and OpenBLAS

+ +

Dr Kazushige Goto wrote a tuned BLAS for several processors and +OSes, which was frozen in mid-2010. The final version is known as +GotoBLAS2, and was re-released under a much less restrictive licence. +Once it is built and installed, it can be used by configuring R with +

+
+
--with-blas="-lgoto2"
+
+ +

See Shared BLAS for an alternative (and in many ways preferable) way to use it.

+

OpenBLAS (http://www.openblas.net/) is a descendant +project with support for some later CPUs (e.g. Intel Sandy Bridge). +Once installed it can be used by something like +

+
+
--with-blas="-lopenblas"
+
+ +

or as a shared BLAS. +

+
+ +
+


+
+ +

A.3.1.4 Intel MKL

+

For Intel processors (and perhaps others) and some distributions of +Linux, there is Intel’s Math Kernel Library. You are strongly +encouraged to read the MKL User’s Guide, which is installed with the +library, before attempting to link to MKL. This includes a ‘link line +advisor’ which will suggest appropriate incantations: its use is +recommended. Or see +https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor. +

+

There are also versions of MKL for OS X and Windows, but at the time +these were tried they did not work with the standard compilers used for +R on those platforms. +

+

The MKL interface has changed several times and may change again: the +following examples have been used with versions 10.3 to 11.3, for GCC +compilers on ‘x86_64’. +

+

To use a sequential version of MKL we used

+
+
MKL_LIB_PATH=/path/to/intel_mkl/lib/intel64
+export LD_LIBRARY_PATH=$MKL_LIB_PATH
+MKL="-L${MKL_LIB_PATH} -lmkl_gf_lp64 -lmkl_core -lmkl_sequential"
+./configure --with-blas="$MKL" --with-lapack
+
+ +

The option --with-lapack is used since MKL contains a tuned +copy of LAPACK as well as BLAS (see LAPACK), although this +can be omitted. +

+

Threaded MKL may be used by replacing the line defining the variable +MKL by +

+
+
MKL="-L${MKL_LIB_PATH} -lmkl_gf_lp64 -lmkl_core \
+     -lmkl_gnu_thread -ldl -lpthread"
+
+ +

The default number of threads will be chosen by the OpenMP software, but +can be controlled by setting OMP_NUM_THREADS or +MKL_NUM_THREADS, and in recent versions seems to default to a +sensible value for sole use of the machine. +
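For example, to restrict a threaded MKL to four threads before starting R (the value is illustrative):

export MKL_NUM_THREADS=4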

+

It has been reported that +

+
+
--with-blas='-mkl=parallel' --with-lapack
+
+ +

worked with the Intel 2015.3 compilers on Centos 6. +

+
+ +
+


+
+ +

A.3.1.5 Shared BLAS

+ +

The BLAS library will be used for many of the add-on packages +as well as for R itself. This means that it is better to use a +shared/dynamic BLAS library, as most of a static library will +be compiled into the R executable and each BLAS-using +package. +

+

R offers the option of compiling the BLAS into a dynamic +library libRblas stored in R_HOME/lib and linking +both R itself and all the add-on packages against that library. +

+

This is the default on all platforms except AIX unless an external +BLAS is specified and found: for the latter it can be used by +specifying the option --enable-BLAS-shlib, and it can always be +disabled via --disable-BLAS-shlib. +

+

This has both advantages and disadvantages. +

+
    +
  • It saves space by having only a single copy of the BLAS +routines, which is helpful if there is an external static BLAS +such as used to be standard for ATLAS. + +
  • There may be performance disadvantages in using a shared BLAS. +Probably the most likely is when R’s internal BLAS is used +and R is not built as a shared library, when it is possible to +build the BLAS into R.bin (and libR.a) without +using position-independent code. However, experiments showed that in +many cases using a shared BLAS was as fast, provided high +levels of compiler optimization are used. + +
  • It is easy to change the BLAS without needing to re-install +R and all the add-on packages, since all references to the +BLAS go through libRblas, and that can be replaced. +Note though that any dynamic libraries the replacement links to will +need to be found by the linker: this may need the library path to be +changed in R_HOME/etc/ldpaths. +
+ +

Another option to change the BLAS in use is to symlink a +dynamic BLAS library (such as ACML or Goto’s) to +R_HOME/lib/libRblas.so. For example, just +

+
+
mv R_HOME/lib/libRblas.so R_HOME/lib/libRblas.so.keep
+ln -s /opt/acml5.1.0/gfortran64_mp/lib/libacml_mp.so R_HOME/lib/libRblas.so
+
+

will change the BLAS in use to multithreaded ACML. A similar +link works for some versions of Goto BLAS, OpenBLAS and MKL +(provided the appropriate lib directory is in the run-time +library path or ld.so cache). +
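A corresponding sketch for OpenBLAS (the library path is an assumption for a typical 64-bit Linux layout) would be:

mv R_HOME/lib/libRblas.so R_HOME/lib/libRblas.so.keep
ln -s /usr/lib64/libopenblas.so R_HOME/lib/libRblas.so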

+ +
+ +
+


+
+ +

A.3.2 LAPACK

+ + +

Provision is made for using an external LAPACK library, principally to +cope with BLAS libraries which contain a copy of LAPACK (such +as sunperf on Solaris, Accelerate on OS X and ACML and MKL +on ‘ix86’/‘x86_64’ Linux). At least LAPACK version 3.2 +is required. This can only be done if --with-blas has been used. +

+

However, the likely performance gains are thought to be small (and may +be negative), and the default is not to search for a suitable LAPACK +library, and this is definitely not recommended. You can +specify a specific LAPACK library or a search for a generic library by +the configuration option --with-lapack. The default for +--with-lapack is to check the BLAS library and then +look for an external library ‘-llapack’. Sites searching for the +fastest possible linear algebra may want to build a LAPACK library using +the ATLAS-optimized subset of LAPACK. To do so specify something like +

+
+
--with-lapack="-L/path/to/ATLAS/libs -llapack -lcblas"
+
+ +

since the ATLAS subset of LAPACK depends on libcblas. A value +for --with-lapack can be set via the environment +variable + +LAPACK_LIBS, but this will only be used if --with-lapack +is specified (as the default value is no) and the BLAS library +does not contain LAPACK. +

+

Since ACML contains a full LAPACK, if selected as the BLAS it +can be used as the LAPACK via --with-lapack. +

+

If you do use --with-lapack, be aware of potential problems +with bugs in the LAPACK sources (or in the posted corrections to those +sources). In particular, bugs in DGEEV and DGESDD have +resulted in error messages such as +

+
+
DGEBRD gave error code -10
+
+ +

Other potential problems are incomplete versions of the libraries, seen several times in Linux distributions over the years.

+

Please do bear in mind that using --with-lapack is +‘definitely not recommended’: it is provided only +because it is necessary on some platforms and because some users want to +experiment with claimed performance improvements. Reporting problems +where it is used unnecessarily will simply irritate the R helpers. +

+

Note too the comments about ISO/IEC 60559 +compliance in the section of external BLAS: these apply +equally to an external LAPACK, and for example the Intel MKL +documentation says +

+

LAPACK routines assume that input matrices do not contain IEEE 754 +special values such as INF or NaN values. Using these special values may +cause LAPACK to return unexpected results or become unstable. +

+ +

We rely on limited support in LAPACK for matrices with 2^{31} or +more elements: it is quite possible that an external LAPACK will not +have that support. +

+

If you have a pure FORTRAN 77 compiler which cannot compile LAPACK it +may be possible to use CLAPACK from +http://www.netlib.org/clapack/ by something like +

+
--with-lapack="-lclapack -lf2c"
+
+

provided these were built with position-independent code and the calling +conventions for double complex function return values match those in the +BLAS used, so it may be simpler to use CLAPACK built to use CBLAS and +

+
--with-lapack="-lclapack -lcblas -lf2c"
+
+ + + +
+ +
+


+
+ +

A.3.3 Caveats

+ +

As with all libraries, you need to ensure that they and R were +compiled with compatible compilers and flags. For example, this has +meant that on Sun Sparc using the native compilers the flag +-dalign is needed if sunperf is to be used. +

+

On some systems it has been necessary that an external +BLAS/LAPACK was built with the same FORTRAN compiler used to +build R. +


Appendix B Configuration on a Unix-alike


B.1 Configuration options

+ +

configure has many options: running +

+
+
./configure --help
+
+ +

will give a list. Probably the most important ones not covered +elsewhere are (defaults in brackets) +

+
+
--with-x
+

use the X Window System [yes] +

+
--x-includes=DIR
+

X include files are in DIR +

+
--x-libraries=DIR
+

X library files are in DIR +

+
--with-readline
+

use readline library (if available) [yes] +

+
--enable-R-profiling
+

attempt to compile support for Rprof() [yes] +

+
--enable-memory-profiling
+

attempt to compile support for Rprofmem() and tracemem() [no] +

+
--enable-R-shlib
+

build R as a shared/dynamic library [no] +

+
--enable-BLAS-shlib
+

build the BLAS as a shared/dynamic library [yes, except on AIX] +

+
+ +

You can use --without-foo or --disable-foo for the +negatives. +

+

You will want to use --disable-R-profiling if you are building a profiled executable of R (e.g. with ‘-pg’).

+

Flag --enable-R-shlib causes the make process to build R as +a dynamic (shared) library, typically called libR.so, and link +the main R executable R.bin against that library. This can +only be done if all the code (including system libraries) can be +compiled into a dynamic library, and there may be a +performance47 penalty. So you probably +only want this if you will be using an application which embeds R. +Note that C code in packages installed on an R system linked with +--enable-R-shlib is linked against the dynamic library and so +such packages cannot be used from an R system built in the default +way. Also, because packages are linked against R they are on some +OSes also linked against the dynamic libraries R itself is linked +against, and this can lead to symbol conflicts. +

+

For maximally effective use of valgrind, R should be +compiled with valgrind instrumentation. The configure option +is --with-valgrind-instrumentation=level, where +level is 0, 1 or 2. (Level 0 is the default and does not add +anything.) The system headers for valgrind can be requested +by option --with-system-valgrind-headers: they will be used if +present (on Linux they may be in a separate package such as +valgrind-devel). Note though that there is no guarantee that the +code in R will be compatible with future valgrind headers. +
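For example, a level-2 instrumented build using the system valgrind headers would be configured with:

./configure --with-valgrind-instrumentation=2 --with-system-valgrind-headers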

+

If you need to re-configure R with different options you may need to run +make clean or even make distclean before doing so. +

+

The configure script has other generic options added by autoconf which are not supported for R: in particular building for one architecture on a different host is not possible.

+

B.2 Internationalization support

+ +

Translation of messages is supported via GNU gettext +unless disabled by the configure option --disable-nls. +The configure report will show NLS as one of the +‘Additional capabilities’ if support has been compiled in, and running +in an English locale (but not the C locale) will include +

+
+
  Natural language support but running in an English locale
+
+ +

in the greeting on starting R. +

+ +

B.3 Configuration variables

+ + +

If you need or want to set certain configure variables to something +other than their default, you can do that by either editing the file +config.site (which documents many of the variables you might want +to set: others can be seen in file etc/Renviron.in) or on the +command line as +

+
+
./configure VAR=value
+
+ +

If you are building in a directory different from the sources, there can +be copies of config.site in the source and the build directories, +and both will be read (in that order). In addition, if there is a file +~/.R/config, it is read between the config.site files in +the source and the build directories. +

+

There is also a general autoconf mechanism for config.site files, which are read before any of those mentioned in the previous paragraph. This looks first at a file specified by the environment variable CONFIG_SITE, and if that is not set, at files such as /usr/local/share/config.site and /usr/local/etc/config.site in the area (exemplified by /usr/local) where R would be installed.

+

These variables are precious, implying that they do not have to +be exported to the environment, are kept in the cache even if not +specified on the command line, checked for consistency between two +configure runs (provided that caching is used), and are kept during +automatic reconfiguration as if having been passed as command line +arguments, even if no cache is used. +

+

See the variable output section of configure --help for a list of +all these variables. +

+

If you find you need to alter configure variables, it is worth noting +that some settings may be cached in the file config.cache, and it +is a good idea to remove that file (if it exists) before re-configuring. +Note that caching is turned off by default: use the command line +option --config-cache (or -C) to enable caching. +


B.3.1 Setting paper size

+ +

One common variable to change is R_PAPERSIZE, which defaults to +‘a4’, not ‘letter’. (Valid values are ‘a4’, +‘letter’, ‘legal’ and ‘executive’.) +

+

This is used both when configuring R to set the default, and when +running R to override the default. It is also used to set the +paper size when making PDF manuals. +

+

The configure default will most often be ‘a4’ if R_PAPERSIZE +is unset. (If the (Debian Linux) program paperconf is found + +or the environment variable PAPERSIZE is set, these are used to +produce the default.) +
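For example, to make US letter the default at build time:

./configure R_PAPERSIZE=letter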

+

B.3.2 Setting the browsers

+ + +

Another precious variable is R_BROWSER, the default HTML +browser, which should take a value of an executable in the user’s path +or specify a full path. +

+ +

Its counterpart for PDF files is R_PDFVIEWER. +
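For instance (the program names are examples; any executable on the path or a full path will do):

./configure R_BROWSER=firefox R_PDFVIEWER=xpdf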

+

B.3.3 Compilation flags

+ +

If you have libraries and header files, e.g., for GNU +readline, in non-system directories, use the variables LDFLAGS +(for libraries, using ‘-L’ flags to be passed to the linker) and +CPPFLAGS (for header files, using ‘-I’ flags to be passed to +the C/C++ preprocessors), respectively, to specify these locations. +These default to ‘-L/usr/local/lib’ (LDFLAGS, +‘-L/usr/local/lib64’ on most 64-bit Linux OSes) and +‘-I/usr/local/include’ (CPPFLAGS) to catch the most common +cases. If libraries are still not found, then maybe your +compiler/linker does not support re-ordering of -L and +-l flags (this has been reported to be a problem on HP-UX with +the native cc). In this case, use a different compiler (or a +front end shell script which does the re-ordering). +
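For example (the prefix /opt/readline is illustrative), for a readline installed outside the system directories:

./configure CPPFLAGS="-I/opt/readline/include" LDFLAGS="-L/opt/readline/lib"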

+

These flags can also be used to build a faster-running version of R. +On most platforms using gcc, having ‘-O3’ in +CFLAGS and FFLAGS produces worthwhile performance gains +with gcc and gfortran, but may result in a less +reliable build (both segfaults and incorrect numeric computations have +been seen). On systems using the GNU linker (especially those +using R as a shared library), it is likely that including +‘-Wl,-O1’ in LDFLAGS is worthwhile, and +‘'-Bdirect,--hash-style=both,-Wl,-O1'’ is recommended at +https://lwn.net/Articles/192624/. Tuning compilation to a +specific CPU family (e.g. ‘-mtune=native’ for +gcc) can give worthwhile performance gains, especially on +older architectures such as ‘ix86’. +

+

B.3.4 Making manuals

+ + + +

The default settings for making the manuals are controlled by +R_RD4PDF and R_PAPERSIZE. +

+

B.4 Setting the shell

+ +

By default the shell scripts such as R will be ‘#!/bin/sh’ +scripts (or using the SHELL chosen by configure). This is +almost always satisfactory, but on a few systems /bin/sh is not a +Bourne shell or clone, and the shell to be used can be changed by +setting the configure variable R_SHELL to a suitable value (a full +path to a shell, e.g. /usr/local/bin/bash). +

+

B.5 Using make

+ + +

To compile R, you will most likely find it easiest to use +GNU make, although the Sun make works on +Solaris. The native make has been reported to fail on SGI +Irix 6.5 and Alpha/OSF1 (aka Tru64). +

+

To build in a separate directory you need a make that supports +the VPATH variable, for example GNU make and +Sun make. +

+

dmake has also been used, e.g. on Solaris 10.

+

If you want to use a make by another name, for example if your +GNU make is called ‘gmake’, you need to set the +variable MAKE at configure time, for example +

+ +
+
./configure MAKE=gmake
+
+ +

B.6 Using FORTRAN

+ + +

To compile R, you need a FORTRAN compiler. The default is to search for f95, fort, xlf95, ifort, ifc, efc, pgf95, lf95, gfortran, ftn, g95, f90, xlf90, pghpf, pgf90, epcf90, g77, f77, xlf, frt, pgf77, cf77, fort77, fl32, af77 (in that order)48, and use whichever is found first; if none is found, R cannot be compiled. However, if CC is gcc, the matching FORTRAN compiler (g77 for gcc 3 and gfortran for gcc 4) is used if available.

+

The search mechanism can be changed using the configure variable +F77 which specifies the command that runs the FORTRAN 77 +compiler. If your FORTRAN compiler is in a non-standard location, you + +should set the environment variable PATH accordingly before +running configure, or use the configure variable F77 to +specify its full path. +

+

If your FORTRAN libraries are in slightly peculiar places, you should + +also look at LD_LIBRARY_PATH or your system’s equivalent to make +sure that all libraries are on this path. +

+

Note that only FORTRAN compilers which convert identifiers to lower case +are supported. +

+

You must set whatever compilation flags (if any) are needed to ensure +that FORTRAN integer is equivalent to a C int pointer and +FORTRAN double precision is equivalent to a C double +pointer. This is checked during the configuration process. +

+

Some of the FORTRAN code makes use of COMPLEX*16 variables, which +is a Fortran 90 extension. This is checked for at configure +time49, but you may need to avoid +compiler flags asserting FORTRAN 77 compliance. +

+

Compiling the version of LAPACK in the R sources also requires some +Fortran 90 extensions, but these are not needed if an external LAPACK is +used. +

+

It might be possible to use f2c, the FORTRAN-to-C converter +(http://www.netlib.org/f2c), via a script. (An example script +is given in scripts/f77_f2c: this can be customized by setting + + + +the environment variables F2C, F2CLIBS, CC and + +CPP.) You will need to ensure that the FORTRAN type +integer is translated to the C type int. Normally +f2c.h contains ‘typedef long int integer;’, which will work +on a 32-bit platform but needs to be changed to ‘typedef int +integer;’ on a 64-bit platform. If your compiler is not gcc +you will need to set + +FPICFLAGS appropriately. Also, the included LAPACK sources +contain constructs that f2c is unlikely to be able to process, +so you would need to use an external LAPACK library (such as CLAPACK +from http://www.netlib.org/clapack/). +

+

B.7 Compile and load flags

+ +

A wide range of flags can be set in the file config.site or as +configure variables on the command line. We have already mentioned +

+
+
CPPFLAGS
+

header file search directory (-I) and any other miscellaneous +options for the C and C++ preprocessors and compilers +

+
LDFLAGS
+

path (-L), stripping (-s) and any other miscellaneous +options for the linker +

+
+ +

and others include +

+
+
CFLAGS
+

debugging and optimization flags, C +

+
MAIN_CFLAGS
+

ditto, for compiling the main program +

+
SHLIB_CFLAGS
+

for shared objects +

+
FFLAGS
+

debugging and optimization flags, FORTRAN +

+
SAFE_FFLAGS
+

ditto for source files which need exact floating point behaviour +

+
MAIN_FFLAGS
+

ditto, for compiling the main program +

+
SHLIB_FFLAGS
+

for shared objects +

+
MAIN_LDFLAGS
+

additional flags for the main link +

+
SHLIB_LDFLAGS
+

additional flags for linking the shared objects +

+
LIBnn
+

the primary library directory, lib or lib64 +

+
CPICFLAGS
+

special flags for compiling C code to be turned into a shared object +

+
FPICFLAGS
+

special flags for compiling Fortran code to be turned into a shared object +

+
CXXPICFLAGS
+

special flags for compiling C++ code to be turned into a shared object +

+
FCPICFLAGS
+

special flags for compiling Fortran 95 code to be turned into a shared object +

+
DEFS
+

defines to be used when compiling C code in R itself +

+
+ +

Library paths specified as -L/lib/path in LDFLAGS are + +collected together and prepended to LD_LIBRARY_PATH (or your +system’s equivalent), so there should be no need for -R or +-rpath flags. +

+

Variables such as CPICFLAGS are determined where possible by configure. Some systems allow two types of PIC flags, for example ‘-fpic’ and ‘-fPIC’, and if they differ the first allows only a limited number of symbols in a shared object. Since R as a shared library has about 6200 symbols, if in doubt use the larger version.
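So a config.site sketch forcing the larger variant for gcc-style compilers (values are illustrative) would be:

CPICFLAGS=-fPIC
FPICFLAGS=-fPIC
CXXPICFLAGS=-fPIC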

+

To compile a profiling version of R, one might for example want to +use ‘MAIN_CFLAGS=-pg’, ‘MAIN_FFLAGS=-pg’, +‘MAIN_LDFLAGS=-pg’ on platforms where ‘-pg’ cannot be used +with position-independent code. +

+

Beware: it may be necessary to set CFLAGS and +FFLAGS in ways compatible with the libraries to be used: one +possible issue is the alignment of doubles, another is the way +structures are passed. +

+

On some platforms configure will select additional flags for +CFLAGS, CPPFLAGS, FFLAGS, CXXFLAGS and +LIBS in R_XTRA_CFLAGS (and so on). These are for options +which are always required, for example to force IEC 60559 +compliance. +

+

B.8 Maintainer mode

+ +

There are several files that are part of the R sources but can be +re-generated from their own sources by configuring with option +--enable-maintainer-mode and then running make in the +build directory. This requires other tools to be installed, discussed +in the rest of this section. +

+

File configure is created from configure.ac and the files +under m4 by autoconf and aclocal. There is a +formal version requirement on autoconf of 2.62 or later, but +it is unlikely that anything other than the most recent versions have +been thoroughly tested. +

+

File src/include/config.h is created by autoheader. +

+

Grammar files *.y are converted to C sources by an implementation +of yacc, usually bison -y: these are found in +src/main and src/library/tools/src. It is known that +earlier versions of bison generate code which reads (and in +some cases writes) outside array bounds: bison 2.6.1 was found +to be satisfactory. +

+

The ultimate sources for package compiler are in its noweb +directory. To re-create the sources from +src/library/compiler/noweb/compiler.nw, the command +notangle is required. This is likely to need to be installed +from the sources at https://www.cs.tufts.edu/~nr/noweb/ (and can +also be found on CTAN). The package sources are only re-created even in +maintainer mode if src/library/compiler/noweb/compiler.nw has +been updated. +

+

It is likely that in future creating configure will need the GNU +‘autoconf archive’ installed. This can be found at +https://www.gnu.org/software/autoconf-archive/ and as a package +(usually called autoconf-archive) in most packaged distributions, +for example Debian, Fedora, OpenCSW, Homebrew and MacPorts. +

+

Appendix C Platform notes

+ +

This section provides some notes on building R on different Unix-alike +platforms. These notes are based on tests run on one or two systems in +each case with particular sets of compilers and support libraries. +Success in building R depends on the proper installation and functioning +of support software; your results may differ if you have other versions +of compilers and support libraries. +

+

Older versions of this manual (for R < 2.10.0) contain notes on +platforms such as HP-UX, IRIX and Alpha/OSF1 for which we have had no +recent reports. +

+

C macros to select particular platforms can be tricky to track down +(there is a fair amount of misinformation on the Web). The Wiki +(currently) at http://sourceforge.net/p/predef/wiki/Home/ can be +helpful. The R sources currently use +

+
AIX: _AIX
+Cygwin: __CYGWIN__
+FreeBSD: __FreeBSD__
+HP-UX: __hpux__, __hpux
+IRIX: sgi, __sgi
+Linux: __linux__
+OS X: __APPLE__
+NetBSD: __NetBSD__
+OpenBSD: __OpenBSD__
+Solaris: __sun, sun
+Windows: _WIN32, _WIN64
+
+ +
+


+
+ +

C.1 X11 issues

+ +

The ‘X11()’ graphics device is the one started automatically on +Unix-alikes when plotting. As its name implies, it displays on a (local +or remote) X server, and relies on the services provided by the X +server. +

+

The ‘modern’ version of the ‘X11()’ device is based on ‘cairo’ +graphics and (in most implementations) uses ‘fontconfig’ to pick and +render fonts. This is done on the server, and although there can be +selection issues, they are more amenable than the issues with +‘X11()’ discussed in the rest of this section. +

+

When X11 was designed, most displays were around 75dpi, whereas today +they are of the order of 100dpi or more. If you find that X11() +is reporting50 missing font sizes, especially larger ones, it is likely +that you are not using scalable fonts and have not installed the 100dpi +versions of the X11 fonts. The names and details differ by system, but +will likely have something like Fedora’s +

+
xorg-x11-fonts-75dpi
+xorg-x11-fonts-100dpi
+xorg-x11-fonts-ISO8859-2-75dpi
+xorg-x11-fonts-Type1
+xorg-x11-fonts-cyrillic
+
+ +

and you need to ensure that the ‘-100dpi’ versions are installed +and on the X11 font path (check via xset -q). The +‘X11()’ device does try to set a pointsize and not a pixel size: +laptop users may find the default setting of 12 too large (although very +frequently laptop screens are set to a fictitious dpi to appear like a +scaled-down desktop screen). +

+

More complicated problems can occur in non-Western-European locales, so +if you are using one, the first thing to check is that things work in +the C locale. The likely issues are a failure to find any fonts +or glyphs being rendered incorrectly (often as a pair of ASCII +characters). X11 works by being asked for a font specification and +coming up with its idea of a close match. For text (as distinct from +the symbols used by plotmath), the specification is the first element of +the option "X11fonts" which defaults to +

+
+
"-adobe-helvetica-%s-%s-*-*-%d-*-*-*-*-*-*-*"
+
+ +

If you are using a single-byte encoding, for example ISO 8859-2 in +Eastern Europe or KOI8-R in Russian, use xlsfonts to find an +appropriate family of fonts in your encoding (the last field in the +listing). If you find none, it is likely that you need to install +further font packages, such as ‘xorg-x11-fonts-ISO8859-2-75dpi’ and +‘xorg-x11-fonts-cyrillic’ shown in the listing above. +
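For example (commands shown for the ISO 8859-2 case just mentioned):

xset -q                        # shows, among other things, the current font path
xlsfonts | grep 'iso8859-2$'   # lists installed fonts in that encoding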

+

Multi-byte encodings (most commonly UTF-8) are even more complicated. There are few fonts in ‘iso10646-1’, the Unicode encoding, and they only contain a subset of the available glyphs (and are often fixed-width designed for use in terminals). In such locales fontsets are used, made up of fonts encoded in other encodings. If the locale you are using has an entry in the ‘XLC_LOCALE’ directory (typically /usr/share/X11/locale), it is likely that all you need to do is to pick a suitable font specification that has fonts in the encodings specified there. If not, you may have to get hold of a suitable locale entry for X11. This may mean that, for example, Japanese text can be displayed when running in ‘ja_JP.UTF-8’ but not when running in ‘en_GB.UTF-8’ on the same machine (although on some systems many UTF-8 X11 locales are aliased to ‘en_US.UTF-8’ which covers several character sets, e.g. ISO 8859-1 (Western European), JISX0208 (Kanji), KSC5601 (Korean), GB2312 (Chinese Han) and JISX0201 (Kana)).

+

On some systems scalable fonts are available covering a wide range of +glyphs. One source is TrueType/OpenType fonts, and these can provide +high coverage. Another is Type 1 fonts: the URW set of Type 1 fonts +provides standard typefaces such as Helvetica with a larger coverage of +Unicode glyphs than the standard X11 bitmaps, including Cyrillic. These +are generally not part of the default install, and the X server may need +to be configured to use them. They might be under the X11 fonts +directory or elsewhere, for example, +

+
+
/usr/share/fonts/default/Type1
+/usr/share/fonts/ja/TrueType
+
+ + +
+ +
+


+
+ +

C.2 Linux

+ + +

Linux is the main development platform for R, so compilation from the +sources is normally straightforward with the standard compilers. +

+

Remember that some package management systems (such as RPM and +deb) make a distinction between the user version of a package and the +developer version. The latter usually has the same name but with the +extension ‘-devel’ or ‘-dev’: you need both versions +installed. So please check the configure output to see if the +expected features are detected: if for example ‘readline’ is +missing add the developer package. (On most systems you will also need +‘ncurses’ and its developer package, although these should be +dependencies of the ‘readline’ package(s).) You should expect to +see in the configure summary +

+
+
  Interfaces supported:      X11, tcltk
+  External libraries:        readline, zlib, bzlib, lzma, PCRE, curl
+  Additional capabilities:   PNG, JPEG, TIFF, NLS, cairo, ICU
+
+ +

When R has been installed from a binary distribution there are +sometimes problems with missing components such as the FORTRAN +compiler. Searching the ‘R-help’ archives will normally reveal +what is needed. +

+

It seems that ‘ix86’ Linux accepts non-PIC code in shared +libraries, but this is not necessarily so on other platforms, in +particular on 64-bit CPUs such as ‘x86_64’. So care +can be needed with BLAS libraries and when building R as a +shared library to ensure that position-independent code is used in any +static libraries (such as the Tcl/Tk libraries, libpng, +libjpeg and zlib) which might be linked against. +Fortunately these are normally built as shared libraries with the +exception of the ATLAS BLAS libraries. +

+

The default optimization settings chosen for CFLAGS etc are conservative. It is likely that using -mtune will result in significant performance improvements on recent CPUs (especially for ‘ix86’): one possibility is to add -mtune=native for the best possible performance on the machine on which R is being installed: if the compilation is for a site-wide installation, it may still be desirable to use something like -mtune=core2.51 It is also possible to increase the optimization levels to -O3: however for many versions of the compilers this has caused problems in at least one CRAN package.

+

For platforms with both 64- and 32-bit support, it is likely that +

+
+
LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib"
+
+ +

is appropriate since most (but not all) software installs its 64-bit +libraries in /usr/local/lib64. To build a 32-bit version of R +on ‘x86_64’ with Fedora 21 we used +

+
+
CC="gcc -m32"
+CXX="g++ -m32"
+F77="gfortran -m32"
+FC=${F77}
+OBJC=${CC}
+LDFLAGS="-L/usr/local/lib"
+LIBnn=lib
+
+ +

Note the use of ‘LIBnn’: ‘x86_64’ Fedora installs its +64-bit software in /usr/lib64 and 32-bit software in +/usr/lib. Linking will skip over inappropriate binaries, but for +example the 32-bit Tcl/Tk configure scripts are in /usr/lib. It +may also be necessary to set the pkg-config path, e.g. by +

+
+
export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig:/usr/lib/pkgconfig
+
+ +

64-bit versions of Linux are built with support for files > 2Gb, and +32-bit versions will be if possible unless --disable-largefile +is specified. +

+

To build a 64-bit version of R on ‘ppc64’ (also known as +‘powerpc64’) with gcc 4.1.1, Ei-ji Nakama used +

+
+
CC="gcc -m64"
+CXX="gxx -m64"
+F77="gfortran -m64"
+FC="gfortran -m64"
+CFLAGS="-mminimal-toc -fno-optimize-sibling-calls -g -O2"
+FFLAGS="-mminimal-toc -fno-optimize-sibling-calls -g -O2"
+
+ +

the additional flags being needed to resolve problems linking against +libnmath.a and when linking R as a shared library. +

+ +
+


+
+ +

C.2.1 Clang

+ +

R has been built with Linux ‘ix86’ and ‘x86_64’ C and +C++ compilers (http://clang.llvm.org) based on the Clang +front-ends, invoked by CC=clang CXX=clang++, together with +gfortran. These take very similar options to the +corresponding GCC compilers. +

+

This has to be used in conjunction with a Fortran compiler: the +configure code will remove -lgcc from FLIBS, +which is needed for some versions of gfortran. +

+

The current default for clang++ is to use the C++ runtime from +the installed g++. Using the runtime from the libc++ +project (http://libcxx.llvm.org/) has also been tested: for some +R packages only the variant using libcxxabi was successful. +

+

Most builds of clang have no OpenMP support. Builds of +version 3.7.0 or later may.52 +

+
+ +
+


+
+ +

C.2.2 Intel compilers

+ +

Intel compilers have been used under ‘ix86’ and ‘x86_64’ +Linux. Brian Ripley used version 9.0 of the compilers for +‘x86_64’ on Fedora Core 5 with +

+
+
CC=icc
+CFLAGS="-g -O3 -wd188 -ip -mp"
+F77=ifort
+FLAGS="-g -O3 -mp"
+CXX=icpc
+CXXFLAGS="-g -O3 -mp"
+FC=ifort
+FCFLAGS="-g -O3 -mp"
+ICC_LIBS=/opt/compilers/intel/cce/9.1.039/lib
+IFC_LIBS=/opt/compilers/intel/fce/9.1.033/lib
+LDFLAGS="-L$ICC_LIBS -L$IFC_LIBS -L/usr/local/lib64"
+SHLIB_CXXLD=icpc
+
+ +

configure will add ‘-c99’ to CC for +C99-compliance. This causes warnings with icc 10 and later, so +use CC="icc -std=c99" there. The flag -wd188 suppresses +a large number of warnings about the enumeration type ‘Rboolean’. +Because the Intel C compiler sets ‘__GNUC__’ without complete +emulation of gcc, we suggest adding CPPFLAGS=-no-gcc. +

+

To maintain correct IEC 60559 arithmetic you most likely +need add flags to CFLAGS, FFLAGS and CXXFLAGS such +as -mp (shown above) or -fp-model precise -fp-model +source, depending on the compiler version. +

+

Others have reported success with versions 10.x and 11.x. Bjørn-Helge Mevik reported success with version 2015.3 of the compilers (see https://stat.ethz.ch/pipermail/r-devel/2015-September/071717.html), using (for a SandyBridge CPU on Centos 6.x)

+
+
fast="-fp-model precise -ip -O3 -opt-mem-layout-trans=3 -xHost -mavx"
+CC=icc
+CFLAGS="$fast -wd188"
+F77=ifort
+FFLAGS="$fast"
+CXX=icpc
+CXXFLAGS="$fast"
+FC=$F77
+FCFLAGS=$FFLAGS
+
+ + +
+ +
+


+
+ +

C.2.3 Oracle Solaris Studio compilers

+ +

Brian Ripley tested the Sun Studio 12 compilers, since renamed to Oracle +Solaris Studio. On ‘x86_64’ Linux with +

+
+
CC=suncc
+CFLAGS="-xO5 -xc99 -xlibmil -nofstore"
+CPICFLAGS=-Kpic
+F77=sunf95
+FFLAGS="-O5 -libmil -nofstore"
+FPICFLAGS=-Kpic
+CXX="sunCC -library=stlport4"
+CXXFLAGS="-xO5 -xlibmil -nofstore -features=tmplrefstatic"
+CXXPICFLAGS=-Kpic
+FC=sunf95
+FCFLAGS=$FFLAGS
+FCPICFLAGS=-Kpic
+LDFLAGS=-L/opt/sunstudio12.1/rtlibs/amd64
+SHLIB_LDFLAGS=-shared
+SHLIB_CXXLDFLAGS=-G
+SHLIB_FCLDFLAGS=-G
+SAFE_FFLAGS="-O5 -libmil"
+
+ +

-m64 could be added, but was the default. Do not use +-fast: see the warnings under Solaris. (The C++ options are +also explained under Solaris.) +

+

Others have found on at least some versions of ‘ix86’ Linux that +the configure flag --disable-largefile was needed (since +glob.h on that platform presumed gcc was being used). +

+ +
+ +
+


+
+ +

C.3 OS X

+ + +

To build R you need to have installed Apple’s ‘Command Line Tools’ +(on some versions installing Xcode suffices). You also need +readline (or to configure using --without-readline), and +a Fortran compiler. Those and other binary components are available +from https://r.research.att.com/libs. +

+

An X sub-system is required unless configuring using +--without-x: see https://xquartz.macosforge.org/. (Note +that XQuartz will likely need to be re-installed after an OS upgrade.) +To build R you need Apple’s ‘Command Line Tools’: these can be +(re-)installed by xcode-select --install. (If you have a +fresh OS installation, running e.g. make in a terminal will +offer the installation of the command-line tools. If you have installed +Xcode, this provides the command-line tools. The tools will need to be +reinstalled when OS X is upgraded, as upgrading partially removes them.) +

+

The instructions here are for ‘x86_64’ builds on 10.6 (Snow Leopard) or later. In principle R can be built for 10.4.x, 10.5.x and for PowerPC or 32-bit Intel Macs but these have not been tested recently.

+

To use the quartz() graphics device you need to configure with +--with-aqua (which is the default): quartz() then +becomes the default device when running R at the console and X11 +would only be used for the command-line-R data editor/viewer and one +version of Tcl/Tk. (This needs an Objective-C compiler53 which can compile the source code of +quartz().) +

+

Use --without-aqua if you want a standard Unix-alike build: +apart from disabling quartz() and the ability to use the build +with R.APP, it also changes the default location of the personal +library (see ?.libPaths). Also use +--disable-R-framework to install in the standard layout. +

+

Various compilers can be used. The current CRAN ‘Mavericks’ +distribution of R is built using +

+
+
CC=clang
+CXX=clang++
+F77=gfortran-4.8
+FC=$F77
+OBJC=clang
+CFLAGS=-Wall -mtune=core2 -g -O2
+CXXFLAGS=-Wall -mtune=core2 -g -O2
+OBJCFLAGS=-Wall -mtune=core2 -g -O2
+F77FLAGS=-Wall -g -O2
+FCFLAGS=$F77FLAGS
+
+ +

with clang and clang++ from the ‘Command Line +Tools’ and the Fortran compiler from +https://r.research.att.com/libs/gfortran-4.8.2-darwin13.tar.bz2.54 Apple’s builds of clang +currently have no OpenMP support. +

+

The CRAN ‘Snow Leopard’ distribution of R was built using the compilers shown below.

+ + +

To use these, have in config.site something like +

+
+
CC="llvm-gcc-4.2"
+CXX="llvm-g++-4.2"
+F77="gfortran-4.2 -arch x86_64"
+FC=$F77
+OBJC="clang"
+
+ +

Full names help to ensure that the intended compilers are used. In +particular gcc is a copy of llvm-gcc-4.2 for Xcode < +5 but of clang in Xcode 5. The recommended Fortran compiler +defaults to 32-bit, so -arch x86_64 is needed. (For a 32-bit +build, use -arch i386 for all compiler commands.) +

+

The OpenMP support in this version of gcc is problematic, so +the CRAN build is configured with --disable-openmp. +

+

Other builds of gfortran are available: see +https://gcc.gnu.org/wiki/GFortranBinaries and +http://coudert.name/software.html. To use one of these with a +binary distribution of R you will probably need to specify the name +or path in a personal or site Makevars file (see Customizing package compilation). +

+

More recent and complete distributions of clang are usually +available from http://llvm.org/releases/. In particular, these +include support for the ‘Address Sanitizer’ (not included by Apple until +Xcode 7) and for OpenMP55 in version 3.7.0 and later. +

+

Pre-compiled versions of many of the Useful libraries and programs +are available from https://r.research.att.com/libs/. You will +most likely want at least pcre, xz, jpeg +and readline (and perhaps tiff). +pkg-config is not provided by Apple and useful for many packages: +it will also be used if present when configuring the X11() +device. +

+

Recent versions of OS X ship with zlib version 1.2.8 and +bzlib version 1.0.6, sufficient for the default +configure checks. Mavericks has a recent enough version of +libcurl: Snow Leopard does not. +

+

Support for cairo (without Pango) can be enabled in two +ways: both need pkg-config available. XQuartz ships cairo +and its version will be selected if its pkg-config files are +first on the configuration path: for example by setting +

+
export PKG_CONFIG_PATH=/opt/X11/lib/pkgconfig:/usr/local/lib/pkgconfig:/usr/lib/pkgconfig
+
+

or appending that variable to the configure command. +Otherwise the binary libraries at +https://r.research.att.com/libs/ can be used: cairo, +fontconfig, freetype, pixman and +pkgconfig-system-stubs-darwin13.tar.gz are needed, plus +libpng for PNG support. +

+ + +

The Accelerate library can be used via the configuration options +

+
+
--with-blas="-framework Accelerate" --with-lapack
+
+ +

to provide potentially higher-performance versions of the BLAS +and LAPACK routines. (Use of Accelerate with +--with-lapack does not work on Snow Leopard: it may work there +without.)56 +

+

Looking at the top of +/Library/Frameworks/R.framework/Resources/etc/Makeconf +will show the compilers and configuration options used for the +CRAN binary package for R: at the time of writing +

+
--enable-memory-profiling
+
+

was used for ‘Mavericks’. +

+

Configure option --with-internal-tzcode is the default on OS X, +as the system implementation of time zones does not work correctly for +times before 1902 or after 2037 (despite using a 64-bit time_t). +

+

The TeX implementation used by the developers is MacTeX +(https://www.tug.org/mactex/): the full installation is about +4GB, but a smaller version is available at +https://www.tug.org/mactex/morepackages.html: you will need to +add some packages, e.g. for the 2015 version we needed to add +cm-super, helvetic, inconsolata and texinfo +which brought this to about 410MB. ‘TeX Live Utility’ (available +via the MacTeX front page) provides a graphical means to manage +TeX packages. +

+

One OS X quirk is that the default path has /usr/local/bin after +/usr/bin, contrary to common practice on Unix-alikes. This means +that if you install tools from the sources they will by default be +installed under /usr/local and not supersede the system +versions. +

+

If you upgrade your OS you should re-install the ‘Command Line Tools’ +and may need to re-install XQuartz and Java (this has been needed for +some upgrades but not others). +

+ +
+


+
+ +

C.3.1 Mavericks and later

+ +

For these versions Apple makes available compilers based on +clang, and C++ headers and runtime are from LLVM’s +‘libc++’ project, as part of the ‘Command Line Tools’ (sometimes +called ‘Command Line Developer Tools’) and of Xcode (you only need one +or the other). +

+

These tools can be (re-)installed by xcode-select --install. +(If you have a fresh installation of Mavericks or later, running e.g. +make in a terminal will offer the installation of the +command-line tools, or perhaps use the versions from Xcode. However, +after an OS update, you are advised to re-install them.) +

+

To use the compilers from the command-line tools with the recommended +Fortran compiler, have in config.site something like +

+
+
CC=clang
+CXX=clang++
+F77=gfortran-4.8
+FC=$F77
+OBJC=clang
+
+ +

More recent and complete distributions of clang are usually +available from http://llvm.org/releases/. In particular, these +include support for the ‘Address Sanitizer’ (not included by Apple until +Xcode 7) and for OpenMP in versions 3.7.0 and later. +

+ +

See the comments under Mountain Lion about X11 and GTK. +

+

If you upgrade the OS you should re-install any of XQuartz, the ‘Command +Line Tools’ and Java which you have installed. (Upgrading may partially +remove previous versions which can be confusing.) +

+

There are some warnings using the recommended gfortran build +under Yosemite. +

+ +
+ +
+


+
+ +

C.3.2 Lion and Mountain Lion

+ +

‘Command-line Tools for Xcode’ used to be part of the Apple Developer +Tools (‘Xcode’) but for these versions needs to be installed separately. +They can be downloaded from +http://developer.apple.com/devcenter/mac/ (you will need to +register there: that allows you to download older versions available for +your OS) or from within some versions of Xcode you can install the +command-line tools from the ‘Downloads’ pane in the +‘Preferences’. +

+

The X11 system used with Mountain Lion is XQuartz (see above): Lion +included an X11 system. +

+

To build the graphics devices depending on cairographics, the XQuartz +path for pkg-config files needs to be known to +pkg-config when configure is run: this usually means +adding it to the PKG_CONFIG_PATH environment variable, e.g. +

+
+
export PKG_CONFIG_PATH= \
+  /opt/X11/lib/pkgconfig:/usr/local/lib/pkgconfig:/usr/lib/pkgconfig
+
+ +

or putting +

+
PKG_CONFIG_PATH=/opt/X11/lib/pkgconfig:/usr/local/lib/pkgconfig:/usr/lib/pkgconfig
+
+ +

in config.site. +

+

For some pre-compiled software, for example the GTK framework, +/opt/X11/include may need to be added to the include paths. +

+

If you install the command-line tools for Xcode 4.6.3 you will get the +compilers used for the CRAN binary distribution: those for Xcode 5 can +be installed afterwards. +

+ +
+ +
+

+Next: , Previous: , Up: OS X   [Contents][Index]

+
+ +

C.3.3 Snow Leopard

+ +

A quirk on Snow Leopard is that the X11 libraries are not in the default +linking path, so something like ‘LIBS=-L/usr/X11/lib’ may be +required in config.site, or you can use the configure +options --x-includes=/usr/X11/include +--x-libraries=/usr/X11/lib. +

+

The CRAN binaries were built using Xcode 4.2, a version +available only to subscribing developers. It is believed that 3.2.6 (the +last public free version for Snow Leopard) will work. +

+
+ +
+

+Next: , Previous: , Up: OS X   [Contents][Index]

+
+ +

C.3.4 El Capitan

+ + +

El Capitan was released at the end of September 2015, and experience to +date is with systems which have been updated from Yosemite or earlier. +Upgraded systems need the Command Line Tools reinstalled. +

+

There are problems resulting from the new-to-El-Capitan restriction that +only Apple is allowed to install software under /usr: this +affects inter alia MacTeX and XQuartz. For +MacTeX it is necessary to include /Library/TeX/texbin in +your path rather than /usr/texbin. Upgrading will move +disallowed files to under /Library/SystemMigration/usr: this +includes /usr/X11R6, /usr/texbin, /usr/bin/R, +/usr/bin/Rscript but not the link /usr/X11. +

+

configure can be told to look for X11 in +XQuartz’s main location of /opt/X11, e.g. by +

+
--x-includes=/opt/X11/include --x-libraries=/opt/X11/lib
+
+ +

although the linked versions under /usr/X11 will be found (if the +link is present). +

+ +
+ +
+

+Next: , Previous: , Up: OS X   [Contents][Index]

+
+ +

C.3.5 Tcl/Tk headers and libraries

+ +

If you plan to use the tcltk package for R, you need to +install a distribution of Tcl/Tk. There are two alternatives. If you +use R.APP you will want to use X11-based Tcl/Tk (as used on other +Unix-alikes), which is installed as part of the CRAN binary for R. +This may need +

+
--with-tcltk=/usr/local/lib
+
+

or +

+
--with-tcl-config=/usr/local/lib/tclConfig.sh 
+--with-tk-config=/usr/local/lib/tkConfig.sh
+
+

Note that this requires a fully-updated X11 installation (XQuartz for +Mountain Lion and later). +

+

There is also a native (‘Aqua’) version of Tcl/Tk which produces widgets +in the native OS X style: this will not work with R.APP because of +conflicts over the OS X menu, but for those only using command-line R +this provides a much more intuitive interface to Tk for experienced Mac +users. Most versions of OS X come with Aqua Tcl/Tk libraries, but these +are not recent versions of Tcl/Tk (8.5.9 in Mountain Lion and later). +It is better to install Tcl/Tk 8.6.x from the sources or a binary +distribution from +https://www.activestate.com/activetcl/downloads. Configure R +with +

+
--with-tcl-config=/Library/Frameworks/Tcl.framework/tclConfig.sh 
+--with-tk-config=/Library/Frameworks/Tk.framework/tkConfig.sh
+
+ +

(for the versions bundled with OS X, use paths starting with +/System/Library). +

+

If you need to find out which distribution of Tk is in use at run time, +use +

+
library(tcltk)
+tclvalue(.Tcl("tk windowingsystem"))  # "x11" or "aqua"
+
+ +
+ +
+

+Next: , Previous: , Up: OS X   [Contents][Index]

+
+ +

C.3.6 Java

+ +

The situation with Java support on OS X is messy.57 +

+

Snow Leopard and Lion shipped with a Java 6 runtime (JRE). Mountain +Lion and later do not come with an installed JRE, and an OS X upgrade +removes one if already installed: it is intended to be installed at +first use. Check if a JRE is installed by running java +-version in a Terminal window: if Java is not installed this +should prompt you to install it. You can also install directly the +latest Java from Oracle (currently from +http://www.oracle.com/technetwork/java/javase/downloads/index.html). +

+

You may need to install what Apple calls ‘legacy Java’58 to suppress pop-up messages +even if you have a current version installed. +

+

To see what compatible versions of Java are currently installed, run +/usr/libexec/java_home -V -a x86_64. If needed, set the +environment variable JAVA_HOME to choose between these, both when +R is built from the sources and when R CMD javareconf is +run. +

+
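As an illustration (the JDK version selected here is only an example and will differ between systems):

/usr/libexec/java_home -V -a x86_64                 # list the compatible JVMs installed
export JAVA_HOME=$(/usr/libexec/java_home -v 1.8)   # pick one, e.g. a Java 8 JDK
R CMD javareconf                                    # update R's Java configuration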

Configuring and building R both look for a JRE and for support for +compiling JNI programs (used by packages rJava and +JavaGD); the latter requires a JDK (Java SDK) and not just a +JRE. +

+

The build process tries to fathom out what JRE/JDK to use, but it may +need some help, e.g. by setting JAVA_HOME. An Apple JRE can be +specified explicitly by something like +

+
JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Home
+JAVA_CPPFLAGS="-I/System/Library/Frameworks/JavaVM.framework/Headers"
+JAVA_LD_LIBRARY_PATH=
+JAVA_LIBS="-framework JavaVM"
+
+

The Oracle JDK can be specified explicitly by something like +

+
JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.8.0_51.jdk/Contents/Home
+JAVA_CPPFLAGS="-I/${JAVA_HOME}/include -I/${JAVA_HOME}/include/darwin"
+JAVA_LD_LIBRARY_PATH="${JAVA_HOME}/jre/lib/server"
+JAVA_LIBS="-L/${JAVA_HOME}/jre/lib/server -ljvm" 
+
+

in config.site. +

+

Note that it is necessary to set the environment variable NOAWT +to 1 to install many of the Java-using packages. +

+ +
+ +
+

+Next: , Previous: , Up: OS X   [Contents][Index]

+
+ +

C.3.7 Frameworks

+ +

The CRAN build of R is installed as a framework, which is +selected by the default option +

+
+
./configure --enable-R-framework
+
+ +

(This is intended to be used with an Apple toolchain: other compilers may +not support frameworks correctly.) +

+

It is only needed if you want to build R for use with the R.APP +console, and implies --enable-R-shlib to build R as a +dynamic library. This option configures R to be built and installed +as a framework called R.framework. The default installation path +for R.framework is /Library/Frameworks but this can be +changed at configure time by specifying the flag +--enable-R-framework[=DIR] or at install time as +

+
+
make prefix=/where/you/want/R.framework/to/go install
+
+ +

Note that installation as a framework is non-standard (especially to a +non-standard location) and Unix utilities may not support it (e.g. the +pkg-config file libR.pc will be put somewhere unknown +to pkg-config). +

+
+ +
+

+Previous: , Up: OS X   [Contents][Index]

+
+ +

C.3.8 Building R.app

+ +

Note that building the R.APP GUI console is a separate project, using +Xcode. Before compiling R.APP make sure the current version of R +is installed in /Library/Frameworks/R.framework and working at +the command-line (this can be a binary install). +

+

The current sources can be checked out by +

+
svn co https://svn.r-project.org/R-packages/trunk/Mac-GUI
+
+

and built by loading the R.xcodeproj project (select the +R target and a suitable configuration), or from the command-line +by e.g. +

+
xcodebuild -target R -configuration Release
+
+

See also the INSTALL file in the checkout or directly at +https://svn.r-project.org/R-packages/trunk/Mac-GUI/INSTALL. +

+

R.APP does not need to be installed in any specific way. Building +R.APP results in the R.APP bundle which appears as one R icon. This +application bundle can be run anywhere and it is customary to place it +in the /Applications folder. +

+ +
+ +
+

+Next: , Previous: , Up: Platform notes   [Contents][Index]

+
+ +

C.4 Solaris

+ + +

R has been built successfully on Solaris 10 (both Sparc and +‘x86’) using the (zero cost) Oracle Solaris Studio compilers: +there has been some success with +gcc 4/gfortran. (Recent Sun machines are AMD +Opterons or Intel Xeons (‘amd64’) rather than ‘x86’, but +32-bit ‘x86’ executables are the default.) +

+

There have been few reports on Solaris 11, with no known extra issues. +Solaris 9 and earlier are now so old that it is unlikely that R is +still used with them, and they will not be considered here. +

+

The Solaris versions of several of the tools needed to build R +(e.g. make, ar and ld) are in +/usr/ccs/bin, so if using those tools ensure this is in your +path. A version of the preferred GNU tar is (if +installed) in /usr/sfw/bin. It may be necessary to avoid the +tools in /usr/ucb: POSIX-compliant versions of some tools can be +found in /usr/xpg4/bin and /usr/xpg6/bin. +

+
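For example, a build shell might be set up with something like the following (a sketch only; adjust to the tools actually installed):

PATH=/usr/ccs/bin:/usr/sfw/bin:/usr/xpg4/bin:$PATH
export PATH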

A large selection of Open Source software can be installed from +https://www.opencsw.org, by default installed under +/opt/csw. Solaris 10 ships with bzlib version 1.0.6 +(sufficient for the default --with-system-bzlib) but +zlib version 1.2.3 (too old for --with-system-zlib): +OpenCSW has 1.2.8. +

+

You will need GNU libiconv and readline: the +Solaris version of iconv is not sufficiently powerful. +

+

The native make suffices to build R but a small number of +packages require GNU make (some without good reason +and without declaring it as ‘SystemRequirements’ in the +DESCRIPTION file). +

+

Some people have reported that the Solaris libintl needs to be +avoided, for example by using --disable-nls or +--with-included-gettext or using libintl from OpenCSW. +(On the other hand, there have been many successful installs which +automatically detected libintl from OpenCSW or selected the +included gettext.) +

+

The support for the C99 long double type on Sparc hardware uses +quad-precision arithmetic, and this is usually slow because it is done +by software emulation. On such systems the configure option +--disable-long-double can be used for faster but less accurate +computations. +

+

The Solaris time-zone conversion services seem to be unreliable pre-1916 +in Europe (when daylight-savings time was first introduced): most often +reporting in the non-existent DST variant. Using configure +option --with-internal-tzcode is recommended, and required if +you find time-zone abbreviations being given odd values (as has been +seen on 64-bit builds without it). +

+
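Combining this with the long-double option mentioned above, a configure call on such a system might look like the following sketch:

./configure --with-internal-tzcode --disable-long-double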

When using the Oracle compilers59 do not specify -fast, as this +disables IEEE arithmetic and make check will fail. +

+

It has been reported that some Solaris installations need +

+
+
INTERNET_LIBS="-lsocket -lnsl"
+
+ +

on the configure command line or in file config.site; +however, there have been many successful installs without this. +

+

A little juggling of paths was needed to ensure GNU +libiconv (in /usr/local) was used rather than the Solaris +iconv: +

+
+
CC="cc -xc99"
+CFLAGS="-O -xlibmieee"
+F77=f95
+FFLAGS=-O
+CXX="CC -library=stlport4"
+CXXFLAGS=-O
+FC=f95
+FCFLAGS=$FFLAGS
+FCLIBS="-lfai -lfsu"
+R_LD_LIBRARY_PATH="/usr/local/lib:/opt/csw/gcc4/lib:/opt/csw/lib"
+
+ +

For a 64-bit target add -m64 to the compiler macros +and use something like LDFLAGS=-L/usr/local/lib/sparcv9 or +LDFLAGS=-L/usr/local/lib/amd64 as appropriate. +It will also be necessary to point pkg-config at the 64-bit +directories, e.g. one of +

+
+
PKG_CONFIG_PATH=/opt/csw/lib/amd64/pkgconfig:/usr/lib/amd64/pkgconfig
+PKG_CONFIG_PATH=/opt/csw/lib/sparcv9/pkgconfig:/usr/lib/sparcv9/pkgconfig
+
+ +

and to specify a 64-bit Java VM by e.g. +

+
+
JAVA_CPPFLAGS="-I${JAVA_HOME}/../include -I${JAVA_HOME}/../include/solaris"
+JAVA_LD_LIBRARY_PATH=${JAVA_HOME}/lib/amd64/server
+JAVA_LIBS="-L${JAVA_HOME}/lib/amd64/server \
+  -R${JAVA_HOME}/lib/amd64/server -ljvm"
+
+

With Solaris Studio 12.[23] on Sparc, FCLIBS needs to be +

+
+
FCLIBS="-lfai -lfai2 -lfsu"
+
+ +

(and possibly other Fortran libraries, but this suffices for the +packages currently on CRAN). +

+

Currently ‘amd64’ and ‘sparcv9’ builds work +out-of-the-box with Sun Studio 12u1 but not Solaris Studio 12.2 and +12.3: libRblas.so and lapack.so are generated with code +that causes relocation errors (which is being linked in from the Fortran +libraries). This means that building 64-bit R as a shared library +may be impossible with Solaris Studio >= 12.2. For a standard build the +trick seems to be to manually set FLIBS to avoid the troublesome +libraries. For example, on ‘amd64’ set in config.site +something like +

+
+
FLIBS_IN_SO="-R/opt/solarisstudio12.3/lib/amd64
+  /opt/solarisstudio12.3/lib/amd64/libfui.so
+  /opt/solarisstudio12.3/lib/amd64/libfsu.so"
+
+

For 64-bit Sparc, set in config.site something like +

+
FLIBS="-R/opt/solarisstudio12.3/prod/lib/sparc/64
+ -lifai -lsunimath -lfai -lfai2 -lfsumai -lfprodai -lfminlai -lfmaxlai
+ -lfminvai -lfmaxvai -lfui -lsunmath -lmtsk
+ /opt/solarisstudio12.3/prod/lib/sparc/64/libfsu.so.1"
+
+ +

The Solaris Studio compilers do not by default conform to the C99 +standard (appendix F 8.9) on the return values of functions such as +log: use -xlibmieee to ensure this. +

+

You can target specific Sparc architectures for (slightly) higher +performance: -xtarget=native (in CFLAGS etc) tunes the +compilation to the current machine. +

+

Using -xlibmil in CFLAGS and -xlibmil in +FFLAGS allows more system mathematical functions to be inlined. +

+ +

On ‘x86’ you will get marginally higher performance via +

+
+
CFLAGS="-xO5 -xc99 -xlibmieee -xlibmil -nofstore -xtarget=native"
+FFLAGS="-O5 -libmil -nofstore -xtarget=native"
+CXXFLAGS="-xO5 -xlibmil -nofstore -xtarget=native"
+SAFE_FFLAGS="-libmil -fstore -xtarget=native"
+
+ +

but the use of -nofstore can be less numerically stable, and some +packages (notably mgcv on ‘x86’) failed to compile at +higher optimization levels with version 12.3. +

+

The Solaris Studio compilers provide several implementations of the +C++98 standard which select both the set of headers and a C++ runtime +library. These are selected by the -library flag, which as it +is needed for both compiling and linking is best specified as part of +the compiler. The examples above use ‘stlport4’, currently the +most modern of the options: the default (but still needed to be +specified as it is needed for linking) is ‘Cstd’: see +http://www.oracle.com/technetwork/server-storage/solaris/cmp-stlport-libcstd-142559.html. +Note though that most external Solaris C++ libraries will have been +built with ‘Cstd’ and so an R package using such libraries also +needs to be. Occasionally the option -library=stlport4,Crun +has been needed. +

+

Several CRAN packages using C++ need the more liberal +interpretation given by adding +

+
+
CXXFLAGS="-features=tmplrefstatic"
+
+ + + + + +

The performance library sunperf is available for use with the +Solaris Studio compilers. If selected as a BLAS, it must also +be selected as LAPACK via (for Solaris Studio 12.2 and later) +

+
+
./configure --with-blas='-library=sunperf' --with-lapack
+
+ +

This has often given test failures in the past, in several different +places. At the time of writing it fails in tests/reg-BLAS.R, and on +some builds, including for ‘amd64’, it fails in +example(eigen). +

+

Parsing very complex R expressions needs a lot of stack space when +the Oracle compilers are used: several packages require the stack +increased to at least 20MB. +

+ + + + + +
+ +
+

+Previous: , Up: Solaris   [Contents][Index]

+
+ +

C.4.1 Using gcc

+ +

If using gcc, ensure that the compiler was compiled for the +version of Solaris in use. (This can be ascertained from gcc +-v.) gcc makes modified versions of some header files, and +several reports of problems were due to using gcc compiled on +one version of Solaris on a later version. +

+

The notes here are for gcc set up to use the Solaris linker: +it can also be set up to use GNU ld, but that has not been +tested. +

+

Compilation for a 32-bit Sparc target with gcc 4.9.2 +needed +

+
+
CPPFLAGS=-I/opt/csw/include
+LDFLAGS="-L/opt/csw/gcc4/lib -L/opt/csw/lib"
+
+ +

and for a 64-bit Sparc target +

+
CC="gcc -m64"
+F77="gfortran -m64"
+CXX="g++ -m64"
+FC=$F77
+CPPFLAGS=-I/opt/csw/include
+LDFLAGS="-L/opt/csw/gcc4/lib/sparcv9 -L/opt/csw/lib/sparcv9"
+
+ +

Note that paths such as /opt/csw/gcc4/lib/sparcv9 may need to +be in the + +LD_LIBRARY_PATH during configuration. +

+

The compilation can be tuned to a particular cpu: the CRAN +check system uses -mtune=niagara2. +

+
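For instance, the flags above could be extended in config.site along these lines (illustrative only; pick the -mtune value matching your hardware):

CFLAGS="-O2 -mtune=niagara2"
FFLAGS="-O2 -mtune=niagara2"
CXXFLAGS="-O2 -mtune=niagara2"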

Compilation for an ‘x86’ target with gcc 4.9.2 +needed +

+
+
CC="/opt/csw/gcc4/bin/gcc -m32"
+CPPFLAGS="-I/opt/csw/include -I/usr/local/include"
+F77="/opt/csw/gcc4/bin/gfortran -m32"
+CXX="/opt/csw/gcc4/bin/g++ -m32"
+FC="/opt/csw/gcc4/bin/gfortran -m32"
+LDFLAGS="-L/opt/csw/gcc4/lib -L/opt/csw/lib -L/usr/local/lib"
+
+ +

(-L/opt/csw/lib is needed since TeXLive was built using +32-bit gcc, and we need /opt/csw/lib in +R_LD_LIBRARY_PATH.) +

+

For an ‘amd64’ target with gcc 4.9.2 +we used +

+
+
CC="/opt/csw/gcc4/bin/gcc -m64"
+CPPFLAGS="-I/opt/csw/include -I/usr/local/include"
+F77="/opt/csw/gcc4/bin/gfortran -m64"
+FPICFLAGS=-fPIC
+CXX="/opt/csw/gcc4/bin/g++ -m64"
+FC=$F77
+FCPICFLAGS=$FPICFLAGS
+LDFLAGS="-L/opt/csw/gcc4/lib/amd64 -L/opt/csw/lib/amd64"
+
+ +
+ +
+

+Next: , Previous: , Up: Platform notes   [Contents][Index]

+
+ +

C.5 AIX

+ + +

We no longer support AIX prior to 4.2, and configure will +throw an error on such systems. +

+

Ei-ji Nakama was able to build under AIX 5.2 on ‘powerpc’ with +GCC 4.0.3 in several configurations. 32-bit versions could be +configured with --without-iconv as well as +--enable-R-shlib. For 64-bit versions he used +

+
+
OBJECT_MODE=64
+CC="gcc -maix64"
+CXX="g++ -maix64"
+F77="gfortran -maix64"
+FC="gfortran -maix64"
+
+ +

and was also able to build with the IBM xlc and Hitachi +f90 compilers by +

+
+
OBJECT_MODE=64
+CC="xlc -q64"
+CXX="g++ -maix64"
+F77="f90 -cpu=pwr4 -hf77 -parallel=0 -i,L -O3 -64"
+FC="f90 -cpu=pwr4 -hf77 -parallel=0 -i,L -O3 -64"
+FLIBS="-L/opt/ofort90/lib -lhf90vecmath -lhf90math -lf90"
+
+ +

Some systems have f95 as an IBM compiler that does not by +default accept FORTRAN 77. It needs the flag -qfixed=72, or to +be invoked as xlf_r. +

+

The AIX native iconv does not support encodings ‘latin1’ nor +‘""’ and so cannot be used. (As far as we know GNU +libiconv could be installed.) +

+

Fan Long reported success on AIX 5.3 using +

+
+
OBJECT_MODE=64
+LIBICONV=/where/libiconv/installed
+CC="xlc_r -q64" 
+CFLAGS="-O -qstrict"
+CXX="xlC_r -q64"
+CXXFLAGS="-O -qstrict"
+F77="xlf_r -q64"
+AR="ar -X64"
+CPPFLAGS="-I$LIBICONV/include -I/usr/lpp/X11/include/X11"
+LDFLAGS="-L$LIBICONV/lib -L/usr/lib -L/usr/X11R6/lib"
+
+ + +

On one AIX 6.x system it was necessary to use R_SHELL to set the +default shell to be Bash rather than Zsh. +

+
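For example, assuming Bash is installed as /usr/bin/bash, this can be set in the environment (or in config.site) before configuring:

R_SHELL=/usr/bin/bash
export R_SHELL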

Kurt Hornik and Stefan Theussl at WU (Wirtschaftsuniversität Wien) +successfully built R on a ‘powerpc’ (8-CPU Power6 +system) running AIX 6.1, configuring with or without +--enable-R-shlib (Ei-ji Nakama’s support is gratefully +acknowledged). +

+

It helps to describe the WU build environment first. A small part of +the software needed to build R and/or install packages is available +directly from the AIX Installation DVDs, e.g., Java 6 and X11. +Additional open source software (OSS) is packaged for AIX in .rpm +files and available from both IBM’s “AIX Toolbox for Linux +Applications” +(http://www-03.ibm.com/systems/power/software/aix/linux/) and +http://www.oss4aix.org/download/. The latter website typically +offers more recent versions of the available OSS. All tools needed and +libraries downloaded from these repositories (e.g., GCC, Make, +libreadline, etc.) are typically installed to +/opt/freeware, hence corresponding executables are found in + +/opt/freeware/bin which thus needs to be in PATH for using +these tools. As on other Unix systems one needs GNU +libiconv as the AIX version of iconv is not sufficiently +powerful. Additionally, for proper Unicode compatibility one should +install the corresponding package from the ICU project +(http://www.icu-project.org/download/), which offers pre-compiled +binaries for various platforms which in case of AIX can be installed via +unpacking the tarball to the root file system. For full LaTeX +support one can install the TeX Live DVD distribution +(https://www.tug.org/texlive/): it is recommended to update the +distribution using the tlmgr update manager. For 64-bit R builds +supporting Tcl/Tk this needs to be installed from the sources as available +pre-compiled binaries supply only 32-bit shared objects. +

+

The recent WU testing was done using compilers from both the +GNU Compiler Collection (version 4.2.4) which is available +from one of the above OSS repositories, and the IBM C/C++ (XL C/C++ +10.01) as well as FORTRAN (XL Fortran 12.01) compilers +(http://www14.software.ibm.com/webapp/download/byproduct.jsp#X). +

+

To compile for a 64-bit ‘powerpc’ (Power6 CPU) target +one can use +

+
+
CC ="gcc -maix64 -pthread"
+CXX="g++ -maix64 -pthread"
+FC="gfortran -maix64 -pthread"
+F77="gfortran -maix64 -pthread"
+CFLAGS="-O2 -g -mcpu=power6"
+FFLAGS="-O2 -g -mcpu=power6"
+FCFLAGS="-O2 -g -mcpu=power6"
+
+ +

for the GCC and +

+
+
CC=xlc
+CXX=xlc++
+FC=xlf
+F77=xlf
+CFLAGS="-qarch=auto -qcache=auto -qtune=auto -O3 -qstrict -ma"
+FFLAGS="-qarch=auto -qcache=auto -qtune=auto -O3 -qstrict"
+FCFLAGS="-qarch=auto -qcache=auto -qtune=auto -O3 -qstrict"
+CXXFLAGS="-qarch=auto -qcache=auto -qtune=auto -O3 -qstrict"
+
+ +

for the IBM XL compilers. For the latter, it is important to note that +the decision for generating 32-bit or 64-bit code is done by setting the + +OBJECT_MODE environment variable appropriately (recommended) or +using an additional compiler flag (-q32 or -q64). By +default the IBM XL compilers produce 32 bit code. Thus, to build R with +64-bit support one needs to either export OBJECT_MODE=64 in the +environment or, alternatively, use the -q64 compiler options. +

+

It is strongly recommended to install Bash and use it as the configure +shell, e.g., via setting CONFIG_SHELL=/usr/bin/bash in the +environment, and to use GNU Make (e.g., via +MAKE=/opt/freeware/bin/make). +

+
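A sketch of such a setup, assuming the OSS tools live under /opt/freeware:

export CONFIG_SHELL=/usr/bin/bash
export MAKE=/opt/freeware/bin/make
$CONFIG_SHELL ./configure
$MAKE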

Further installation instructions to set up a proper R development +environment can be found in the “R on AIX” project on R-Forge +(https://R-Forge.R-project.org/projects/aix/). +

+
+ +
+

+Next: , Previous: , Up: Platform notes   [Contents][Index]

+
+ +

C.6 FreeBSD

+ + +

There have been few recent reports on FreeBSD. +

+

There is a ‘port’ at https://www.freebsd.org/ports/math.html, for +R 3.0.2 at the time of writing. Davor Cubranic reported some success +on x86_64 FreeBSD 10.2 for R 3.2.2. +

+

It appears that versions of FreeBSD using clang as the compiler +(the default as from 10.0) need +

+
MAIN_LDFLAGS=-Wl,--export-dynamic
+
+

for R releases up to 3.2.2. +

+

Use of ICU for collation and the configure option +--with-internal-tzcode are desirable workarounds. +

+ +
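Putting these pieces together, a configure call on a clang-based FreeBSD system (for the R versions mentioned above) might look like this sketch:

./configure --with-internal-tzcode MAIN_LDFLAGS=-Wl,--export-dynamic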

C.7 OpenBSD

+ + +

Ingo Feinerer installed R version 3.2.2 on OpenBSD 5.8 arch +‘amd64’ (their name for ‘x86_64’). Details of the build +(and patches applied) are at +http://cvsweb.openbsd.org/cgi-bin/cvsweb/ports/math/R/. +

+
+ +
+

+Next: , Previous: , Up: Platform notes   [Contents][Index]

+
+ +

C.8 Cygwin

+ +

The Cygwin emulation layer on Windows can be treated as a Unix-alike OS. +This is unsupported, but experiments have been conducted and a few +workarounds added. Cygwin has not been tested for R 3.0.0 or later. +

+

The 64-bit version is completely unsupported. The 32-bit version has +never worked well enough to pass R’s make check. +

+

R requires C99 complex type support, which is available as from +Cygwin 1.7.8 (March 2011). However, the (then) implementation of +cacos gave incorrect results, so we undefine HAVE_CACOS +in src/main/complex.c on that platform. It has been reported +that some C99 long double mathematical functions are missing, so +configuring with --disable-long-double was required. +

+

Only building as a shared library can possibly work,60 so use e.g. +

+
+
./configure --disable-nls --enable-R-shlib FLIBS=-lgfortran
+make
+
+ +

Enabling NLS does work if required, although adding +--with-included-gettext is preferable. You will see many +warnings about the use of auto-import. Setting ‘FLIBS’ explicitly +seems needed currently as the auto-detection gives an incorrect value. +

+

You will need the tetex-extra Cygwin package to build +NEWS.pdf and the vignettes. +

+

Note that this gives you a command-line application using readline +for command editing. The ‘X11’ graphics device will work if a +suitable X server is running, and the standard Unix-alike ways of +installing source packages work. There was a bug in the +/usr/lib/tkConfig.sh script in the version we looked at, which +needs to have +

+
+
TK_LIB_SPEC='-ltk84'
+
+ +

The overhead of using shell scripts makes this noticeably slower than a +native build of R on Windows. +

+

Even when R could be built, not all the tests passed: there were +incorrect results from wide-character regular expressions code and from +sourcing CR-delimited files. +

+

Do not use Cygwin’s BLAS library: it is known to give incorrect results. +

+
+ +
+

+Previous: , Up: Platform notes   [Contents][Index]

+
+ +

C.9 New platforms

+ +

There are a number of sources of problems when installing R on a new +hardware/OS platform. These include +

+

Floating Point Arithmetic: R requires arithmetic compliant +with IEC 60559, also known as IEEE 754. +This mandates the use of plus and minus infinity and NaN (not a +number) as well as specific details of rounding. Although almost all +current FPUs can support this, selecting such support can be a pain. +The problem is that there is no agreement on how to set the signalling +behaviour; Sun/Sparc, SGI/IRIX and ‘ix86’ Linux require no +special action, FreeBSD requires a call to (the macro) +fpsetmask(0) and OSF1 required that computation be done with a +-ieee_with_inexact flag etc. On a new platform you must find +out the magic recipe and add some code to make it work. This can often +be done via the file config.site which resides in the top level +directory. +

+

Beware of using high levels of optimization, at least initially. On +many compilers these reduce the degree of compliance to the +IEEE model. For example, using -fast on the Solaris +Studio compilers has caused R’s NaN to be set incorrectly, and +gcc’s -ffast-math and clang’s +-Ofast have given incorrect results. +

+

Shared Objects: There seems to be very little agreement +across platforms on what needs to be done to build shared objects. +There are many different combinations of flags for the compilers and +loaders. GNU libtool cannot be used (yet), as it currently +does not fully support FORTRAN: one would need a shell wrapper for +this. The technique we use is to first interrogate the X window system +about what it does (using xmkmf), and then override this in +situations where we know better (for tools from the GNU +Compiler Collection and/or platforms we know about). This typically +works, but you may have to manually override the results. Scanning the +manual entries for cc and ld usually reveals the +correct incantation. Once you know the recipe you can modify the file +config.site (following the instructions therein) so that the +build will use these options. +

+

It seems that gcc 3.4.x and later on ‘ix86’ Linux +defeat attempts by the LAPACK code to avoid computations entirely in +extended-precision registers, so file src/modules/lapack/dlamc.f +may need to be compiled without optimization. Set the configure +variable SAFE_FFLAGS to the flags to be used for this file. If +configure detects GNU FORTRAN it adds flag +-ffloat-store to FFLAGS. (Other settings are needed when +using icc on ‘ix86’ Linux, for example. Using +-mpc64 is preferable on more recent GCC compilers.) +

+

If you do manage to get R running on a new platform please let us +know about it so we can modify the configuration procedures to include +that platform. +

+

If you are having trouble getting R to work on your platform please +feel free to use the ‘R-devel’ mailing list to ask questions. We +have had a fair amount of practice at porting R to new platforms +... +

+ +
+ +
+

+Next: , Previous: , Up: Top   [Contents][Index]

+
+ +

Appendix D The Windows toolset

+ +

If you want to build R or add-on packages from source in Windows, you +will need to collect, install and test an extensive set of tools. See +https://CRAN.R-project.org/bin/windows/Rtools/ for the current +locations and other updates to these instructions. (Most Windows users +will not need to build add-on packages from source; see Add-on packages for details.) +

+

We have found that the build process for R is quite sensitive to +the choice of tools: please follow our instructions exactly, +even to the choice of particular versions of the tools.61 The build process for add-on packages is somewhat more +forgiving, but we recommend using the exact toolset at first, and only +substituting other tools once you are familiar with the process. +

+

This appendix contains a lot of prescriptive comments. They are +here as a result of bitter experience. Please do not report problems to +the R mailing lists unless you have followed all the prescriptions. +

+

We have collected most of the necessary tools (unfortunately not all, +due to license or size limitations) into an executable installer +named62 Rtools31.exe, +available from https://CRAN.R-project.org/bin/windows/Rtools/. You +should download and run it, choosing the default “Package authoring +installation” to build add-on packages, or the “full installation” if +you intend to build R. +

+

You will need the following items to build R and packages. +See the subsections below for detailed descriptions. +

    +
  • The command line tools (in Rtools*.exe) +
  • The MinGW-w64 32/64-bit toolchain to compile C, Fortran and C++. +
+

For installing simple source packages containing data or R source but +no compiled code, none of these are needed. +

+

A complete build of R including PDF manuals, and producing the +installer will also need the following: +

    +
  • LaTeX +
  • The Inno Setup installer +
  • (optional) qpdf +
+ + +

It is important to set your PATH properly. The installer +Rtools*.exe optionally sets the path to components that it +installs. +

+

Your PATH may include . first, then the bin +directories of the tools, the compiler toolchain and LaTeX. Do not +use filepaths containing spaces: you can always use the short forms +(found by dir /x at the Windows command line). Network shares +(with paths starting \\) are not supported. +

+

For example for a 32-bit build, all on one line, +

+
+
PATH=c:\Rtools\bin;c:\Rtools\gcc-4.6.3\bin;c:\MiKTeX\miktex\bin;
+     c:\R\R-3.2\bin\i386;c:\windows;c:\windows\system32
+
+ +

It is essential that the directory containing the command line tools +comes first or second in the path: there are typically like-named +tools63 in other directories, and they will not +work. The ordering of the other directories is less important, but if in +doubt, use the order above. +

+

Our toolset contains copies of Cygwin DLLs that may conflict with other +ones on your system if both are in the path at once. The normal +recommendation is to delete the older ones; however, at one time we +found our tools did not work with a newer version of the Cygwin DLLs, so +it may be safest not to have any other version of the Cygwin DLLs in your +path. +

+ + + + + + + + +
+ + + +

D.1 LaTeX

+ +

The ‘MiKTeX’ (http://www.miktex.org/) distribution of +LaTeX includes a suitable port of pdftex. This can be set up +to install extra packages ‘on the fly’, which is the simplest way to use +it (and the default). The ‘basic’ version of ‘MiKTeX’ almost +suffices: when last checked packages +

+
+
epsf inconsolata mptopdf url
+
+ +

needed to be added (on the fly or via the ‘MiKTeX’ Package +Manager) to install R. In any case ensure that the inconsolata +package is installed—you can check with the ‘MiKTeX’ Package +Manager. +

+
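If you prefer the command line, the ‘MiKTeX’ package manager can be used instead of the GUI; a sketch (assuming mpm is on the PATH):

mpm --install=epsf --install=inconsolata --install=mptopdf --install=url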

The Rtools*.exe installer does not include any version of +LaTeX. +

+

It is also possible to use the TeXLive distribution from +https://www.tug.org/texlive/. +

+ +

Please read Making the manuals about how to make fullrefman.pdf +and set the environment variable R_RD4PDF suitably; ensure you +have the required fonts installed or that ‘MiKTeX’ is set up to +install LaTeX packages on first use. +

+
+ + + +

D.2 The Inno Setup installer

+ +

To make the installer package (R-3.2.3-win.exe) we +currently require the Unicode version of Inno Setup 5.3.7 or later from +http://jrsoftware.org/. This is not included in +Rtools*.exe. +

+

Copy file src/gnuwin32/MkRules.dist to +src/gnuwin32/MkRules.local and edit it to set ISDIR to the +location where Inno Setup was installed. +

+
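For example (the Inno Setup location shown is purely illustrative):

cp src/gnuwin32/MkRules.dist src/gnuwin32/MkRules.local
# then edit MkRules.local so that ISDIR points at your Inno Setup directory, e.g.
# ISDIR = C:/packages/Inno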
+ + + +

D.3 The command line tools

+ +

This item is installed by the Rtools*.exe installer. +

+ +

If you choose to install these yourself, you will need suitable versions +of at least basename, cat, cmp, comm, +cp, cut, date, diff, du, echo, +expr, gzip, ls, make, makeinfo, +mkdir, mv, rm, rsync, sed, sh, +sort, tar, texindex, touch and uniq; +we use those from the Cygwin distribution +(https://www.cygwin.com/) or compiled from the sources. You will +also need zip and unzip from the Info-ZIP project +(http://www.info-zip.org/). All of these tools are in +Rtools*.exe. +

+ +

Beware: ‘Native’ ports of make are not suitable +(including those called ‘MinGW make’ at the MinGW SourceForge site and +mingw32-make in some MinGW-w64 distributions). There were +also problems with other versions of the Cygwin tools and DLLs. To +avoid frustration, please use our tool set, and make sure it is at the +front of your path (including before the Windows system directories). +If you are using a Windows shell, type PATH at the prompt to find +out. +

+ +

You may need to set the environment variable CYGWIN to a value +including ‘nodosfilewarning’ to suppress messages about +Windows-style paths. +

+
+ + + +

D.4 The MinGW-w64 toolchain

+ +

Technically you need more than just a compiler so the set of tools is +referred to as a ‘toolchain’. +

+

The preferred toolchain is part of Rtools31.exe: this uses a beta +version of gcc 4.6.3 and version 2.0.1 of the MinGW-w64 +project’s runtime. +

+

This toolchain uses multilib: that is there is a single front-end +such as gcc.exe for each of the compilers and 32-bit (the +default) and 64-bit compilation are selected by the flags64 -m32 and -m64 +respectively. The tools are all 32-bit Windows executables and should +be able to run on any current version of Windows—however you do need a +64-bit version of Windows to build 64-bit R as the build process runs +R. +

+

To select a 32-bit or 64-bit build of R, set the options in +MkRules.local appropriately (following the comments in the file). +

+

Some external software libraries will need to be re-compiled under the +new toolchain: especially those providing a C++ interface. Many of +those used by CRAN packages are available from +https://www.stats.ox.ac.uk/pub/Rtools/multilib/. Users +developing packages with Rcpp need to ensure that they use a +version built with exactly the same toolchain as their package: the +recommendation is to build Rcpp from its sources yourself. +

+

There is support for OpenMP and pthreads in this toolchain. As the +performance of OpenMP on Windows is poor for small tasks, it is not used +for R itself. +

+
+ + + +

D.5 Useful additional programs

+ +

The process of making the installer will make use of qpdf to +compact some of the package vignettes, if it is available. Windows +binaries of qpdf are available from +http://sourceforge.net/projects/qpdf/files/. Set the path +to the qpdf installation in file MkRules.local. +

+

Developers of packages will find some of the ‘goodies’ at +https://www.stats.ox.ac.uk/pub/Rtools/goodies useful. +

+

There is a version of the file command that identifies the +type of files, and is used by Rcmd check if available. The +binary distribution is included in Rtools31.exe. +

+

The file xzutils.zip contains the program xz which can +be used to (de)compress files with that form of compression. +

+
+ +
+

+Next: , Previous: , Up: Top   [Contents][Index]

+
+ +

Function and variable index

+ +
Jump to:   C +   +I +   +M +   +R +   +U +   +
+ + + + + + + + + + + + + + + + + + + + + + + + +
Index Entry  Section

C
configure: Simple compilation
configure: Simple compilation
configure: Installation
configure: Installation
configure: Configuration variables
configure: Using make

I
install.packages: Installing packages

M
make: Using make

R
remove.packages: Removing packages
R_HOME: Simple compilation

U
update.packages: Updating packages

+
Jump to:   C +   +I +   +M +   +R +   +U +   +
+ +
+ + + +

Concept index

+ +
Jump to:   A +   +B +   +F +   +I +   +L +   +M +   +O +   +P +   +R +   +S +   +U +   +V +   +W +   +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Index Entry  Section

A
AIX: AIX

B
BLAS library: Linear algebra
BLAS library: OS X
BLAS library: Solaris

F
FORTRAN: Using FORTRAN
FreeBSD: FreeBSD

I
Installation: Installation
Installing under Unix-alikes: Installing R under Unix-alikes
Installing under Windows: Installing R under Windows
Internationalization: Internationalization

L
LAPACK library: LAPACK
LAPACK library: OS X
LAPACK library: Solaris
Libraries: Add-on packages
Libraries, managing: Managing libraries
Libraries, site: Managing libraries
Libraries, user: Managing libraries
Linux: Installing R under Unix-alikes
Linux: Linux
Locale: Internationalization
Locale: Locales
Localization: Internationalization

M
Manuals: Making the manuals
Manuals, installing: Installation

O
Obtaining R: Obtaining R
OpenBSD: FreeBSD
OS X: Installing R under Unix-alikes
OS X: Installing R under OS X
OS X: OS X

P
Packages: Add-on packages
Packages, default: Default packages
Packages, installing: Installing packages
Packages, removing: Removing packages
Packages, updating: Updating packages

R
Repositories: Setting up a package repository

S
Site libraries: Managing libraries
Solaris: Solaris
Sources for R: Getting and unpacking the sources
Subversion: Using Subversion and rsync
Subversion: Essential programs and libraries

U
User libraries: Managing libraries

V
Vignettes: Essential programs and libraries

W
winCairo.dll: Building the cairo devices files

+
Jump to:   A +   +B +   +F +   +I +   +L +   +M +   +O +   +P +   +R +   +S +   +U +   +V +   +W +   +
+ +
+ +
+

+Previous: , Up: Top   [Contents][Index]

+
+ +

Environment variable index

+ +
Jump to:   B +   +C +   +D +   +F +   +J +   +L +   +O +   +P +   +R +   +T +   +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Index Entry  Section

B
BLAS_LIBS: BLAS

C
CC: Using FORTRAN
CONFIG_SITE: Configuration variables
CPP: Using FORTRAN
CYGWIN: The command line tools

D
DESTDIR: Installation
DESTDIR: Unix-alike standalone

F
F2C: Using FORTRAN
F2CLIBS: Using FORTRAN
FPICFLAGS: Using FORTRAN

J
JAVA_HOME: Java support

L
LANG: Localization of messages
LANGUAGE: Localization of messages
LANGUAGE: Localization of messages
LAPACK_LIBS: LAPACK
LC_ALL: Localization of messages
LC_COLLATE: Testing a Unix-alike Installation
LC_MESSAGES: Localization of messages
LD_LIBRARY_PATH: Unix-alike standalone
LD_LIBRARY_PATH: ACML
LD_LIBRARY_PATH: Using FORTRAN
LD_LIBRARY_PATH: Compile and load flags
LD_LIBRARY_PATH: Using gcc
LOCAL_SOFT: Windows packages

O
OBJECT_MODE: AIX

P
PAPERSIZE: Setting paper size
PATH: Essential programs and libraries
PATH: Using FORTRAN
PATH: AIX
PATH: The Windows toolset

R
R_ARCH: Sub-architectures
R_ARCH: Sub-architectures
R_BROWSER: Setting the browsers
R_DEFAULT_PACKAGES: Default packages
R_DISABLE_HTTPD: Help options
R_GSCMD: Useful libraries and programs
R_INSTALL_TAR: Windows packages
R_JAVA_LD_LIBRARY_PATH: Java support
R_JAVA_LD_LIBRARY_PATH: Java support
R_LIBS: Add-on packages
R_LIBS_SITE: Managing libraries
R_LIBS_USER: Managing libraries
R_PAPERSIZE: Making the manuals
R_PAPERSIZE: Running R
R_PAPERSIZE: Setting paper size
R_PAPERSIZE: Making manuals
R_PDFVIEWER: Setting the browsers
R_RD4PDF: Making the manuals
R_RD4PDF: Making manuals
R_RD4PDF: LaTeX
R_SHELL: AIX
R_USER: Running R

T
TAR: Essential programs and libraries
TAR_OPTIONS: Getting and unpacking the sources
TAR_OPTIONS: Getting the source files
TEMP: Running R
TMP: Running R
TMPDIR: Simple compilation
TMPDIR: Building the core files
TMPDIR: Running R
TMPDIR: Running R
TMPDIR: Installing packages

+
Jump to:   B +   +C +   +D +   +F +   +J +   +L +   +O +   +P +   +R +   +T +   +
+ + +
+
+

Footnotes

+ +

(1)

+

e.g. GNU +tar version 1.15 or later, or that from the ‘libarchive’ +(as used on OS X versions 10.6 and later) or ‘Heirloom Toolchest’ +distributions.

+

(2)

+

for some Subversion clients +‘http:’ may appear to work, but requires continual redirection.

+

(3)

+

Most aspects will work with +paths containing spaces, but external software used by R, e.g. +texi2dvi version 4.8, may not.

+

(4)

+

which use lib rather than +lib64 for their primary 64-bit library directories.

+

(5)

+

Instructions on how to install the latest +version are at +https://www.ctan.org/tex-archive/fonts/inconsolata/.

+

(6)

+

on a +Unix-alike, ‘inconsolata’ is omitted if not found by +configure.

+

(7)

+

This will be needed if more than one +sub-architecture is to be installed.

+

(8)

+

with possible values +‘i386’, ‘x64’, ‘32’ and ‘64’.

+

(9)

+

mainly on RedHat and Fedora, whose layout is described +here.

+

(10)

+

How to prepare such a directory is described in file +src/extra/tzone/Notes in the R sources.

+

(11)

+

for example, -fopenmp, -xopenmp or +-qopenmp. This includes for 2015 versions of clang +and the Intel C compiler.

+

(12)

+

Suitable distributions include +Strawberry Perl, http://strawberryperl.com/ and ActivePerl, +https://www.activestate.com/activeperl.

+

(13)

+

for R 3.2.1 and earlier, the installer will +attempt unsuccessfully to install R and Rscript in +/usr/bin.

+

(14)

+

There was for R 3.2.1 but not for R 3.2.2.

+

(15)

+

The installer as +from R 3.2.2 puts links to R and Rscript in +/usr/bin (Mavericks, Yosemite) or /usr/local/bin (El +Capitan and later). If these are missing, you can run directly the +versions in /Library/Frameworks/R.framework/Resources/.

+

(16)

+

unless they were excluded in the build.

+

(17)

+

its binding is locked once the startup files have been +read, so users cannot easily change it.

+

(18)

+

If a proxy needs to be set, see +?download.file.

+

(19)

+

for a small number of +CRAN packages where this is known to be safe and is needed by +the autobuilder this is the default. Look at the source of +tools:::.install_packages for the list. It can also be specified +in the package’s DESCRIPTION file.

+

(20)

+

or by adding it in +a file such as etc/i386/Makevars.site, which does not exist by +default.

+

(21)

+

‘X/Open Portability Guide’, which has +had several versions.

+

(22)

+

On some systems setting +LC_ALL or LC_MESSAGES to ‘C’ disables LANGUAGE.

+

(23)

+

If you try changing from French +to Russian except in a UTF-8 locale, you will most likely find messages +change to English.

+

(24)

+

the +language written in England: some people living in the USA appropriate +this name for their language.

+

(25)

+

with +Americanisms.

+

(26)

+

also known as +IEEE 754

+

(27)

+

at least when storing quantities: the on-FPU +precision is allowed to vary

+

(28)

+

e.g. Bessel, beta and gamma functions

+

(29)

+

including copying MkRules.dist to +MkRules.local and selecting the architecture.

+

(30)

+

also known as +IEEE 754

+

(31)

+

Note +that C11 compilers need not be C99-compliant: R requires support for +double complex and variable-length arrays which are optional in +C11 but are mandatory in C99.

+

(32)

+

-std=c99 excludes POSIX +functionality, but config.h will turn on all GNU +extensions to include the POSIX functionality. The default mode for GCC +5 is -std=gnu11.

+

(33)

+

However, it is possible to break +the default behaviour of glibc by re-specifying the gconv +modules to be loaded.

+

(34)

+

specifically, the C99 +functionality of headers wchar.h and wctype.h, types +wctrans_t and mbstate_t and functions mbrtowc, +mbstowcs, wcrtomb, wcscoll, wcstombs, +wctrans, wctype, and iswctype.

+

(35)

+

including the opendir, readdir, +closedir, popen, stat, glob, access, +getcwd and chdir system calls, and either putenv or +setenv.

+

(36)

+

such as +realpath, symlink.

+

(37)

+

Such as +GNU tar 1.15 or later, bsdtar (from +https://github.com/libarchive/libarchive/, as used by FreeBSD and OS +X 10.6 and later) or tar from the Heirloom Toolchest +(http://heirloom.sourceforge.net/tools.html).

+

(38)

+

texi2dvi is normally a shell +script. Some versions (including that from texinfo 5.2 and 6.0) +need to be run under bash rather than a Bourne shell.

+

(39)

+

If necessary the path to +pkg-config can be specified by setting PKGCONF in +config.site, on the configure command line or in the +environment.

+

(40)

+

also known as ttf-mscorefonts-installer in the +Debian/Ubuntu world: see also +https://en.wikipedia.org/wiki/Core_fonts_for_the_Web.

+

(41)

+

ttf-liberation +in Debian/Ubuntu.

+

(42)

+

sometimes known as PCRE1, and not PCRE2, which +started at version 10.0. PCRE must be built with UTF-8 support (not the +default, and checked by configure) and support for Unicode +properties is assumed by some R packages. JIT support is desirable +for the best performance: support for this and Unicode properties can be +checked at run-time by calling pcre_config().

+

(43)

+

for example to specify +static linking with a build which has both shared and static libraries.

+

(44)

+

This is true even for +the ‘Aqua’ version of Tk on OS X, but distributions of that include a +copy of the X11 files needed.

+

(45)

+

Using the Oracle Solaris Studio +cc and f95 compilers

+

(46)

+

and ‘i686’ for earlier +versions.

+

(47)

+

We have measured 15–20% on ‘i686’ Linux +and around 10% on ‘x86_64’ Linux.

+

(48)

+

On HP-UX fort77 is the +POSIX compliant FORTRAN compiler, and comes after +g77.

+

(49)

+

as well as its equivalence to the Rcomplex +structure defined in R_ext/Complex.h.

+

(50)

+

for example, X11 font at size 14 could not +be loaded.

+

(51)

+

or -mtune=corei7 for Intel Core +i3/i5/i7 with gcc >= 4.6.0.

+

(52)

+

This also needs the OpenMP runtime, +which is usually distributed separately, e.g. at +http://llvm.org/releases.

+

(53)

+

These +days that is defined by Apple’s implementation of clang, so it is +strongly recommended to use that.

+

(54)

+

This +is a tarball which needs to be unpacked in the Terminal by e.g. +sudo tar -zxf gfortran-4.8.2-darwin13.tar.bz2 -C /. It does +not run on Core 2 Duo Macs.

+

(55)

+

This also needs the OpenMP runtime, +which is distributed separately at that site.

+

(56)

+

It is reported that for some non-Apple toolchains +CPPFLAGS needed to contain -D__ACCELERATE__.

+

(57)

+

For more +details see http://www.macstrategy.com/article.php?3.

+

(58)

+

e.g. +Java For OS X 2015-001 from +https://support.apple.com/kb/DL1572.

+

(59)

+

including gcc for +Sparc from Oracle.

+

(60)

+

Windows +DLLs need to have all links resolved at build time and so cannot resolve +against R.bin.

+

(61)

+

For +example, the Cygwin version of make 3.81 fails to work +correctly.

+

(62)

+

for R 3.0.0 and later.

+

(63)

+

such as sort, find and perhaps +make.

+

(64)

+

these +flags apply to the compilers: some of the tools use different flags. +32-bit builds are the default.

+
+
+ + + + + diff --git a/R-data.html b/R-data.html new file mode 100644 index 0000000..53f6521 --- /dev/null +++ b/R-data.html @@ -0,0 +1,3190 @@ + + + + + +R Data Import/Export + + + + + + + + + + + + + + + + +

R Data Import/Export

+ + + + + + + + + + + + + + + + + + + + + + +

Table of Contents

+ + + + + +
+

+Next:   [Contents][Index]

+
+ +

R Data Import/Export

+ +

This is a guide to importing and exporting data to and from R. +

+

This manual is for R, version 3.2.3 (2015-12-10). +

+

Copyright © 2000–2015 R Core Team +

+
+

Permission is granted to make and distribute verbatim copies of this +manual provided the copyright notice and this permission notice are +preserved on all copies. +

+

Permission is granted to copy and distribute modified versions of this +manual under the conditions for verbatim copying, provided that the +entire resulting derived work is distributed under the terms of a +permission notice identical to this one. +

+

Permission is granted to copy and distribute translations of this manual +into another language, under the above conditions for modified versions, +except that this permission notice may be stated in a translation +approved by the R Core Team. +

+ + + + + + + + + + + + + + + + + + + +
+ +
+

+Next: , Previous: , Up: Top   [Contents][Index]

+
+ +

Acknowledgements

+ +

The relational databases part of this manual is based in part on an +earlier manual by Douglas Bates and Saikat DebRoy. The principal author +of this manual was Brian Ripley. +

+

Many volunteers have contributed to the packages used here. The +principal authors of the packages mentioned are +

+
+ + + + + + + + + + + + + + + + + + + + + + +
DBIDavid A. James
dataframes2xlsGuido van Steen
foreignThomas Lumley, Saikat DebRoy, Douglas Bates, Duncan Murdoch and Roger Bivand
gdataGregory R. Warnes
hdf5Marcus Daniels
ncdf, ncdf4David Pierce
rJavaSimon Urbanek
RJDBCSimon Urbanek
RMySQLDavid James and Saikat DebRoy
RNetCDFPavel Michna
RODBCMichael Lapsley and Brian Ripley
ROracleDavid A. James
RPostgreSQLSameer Kumar Prayaga and Tomoaki Nishiyama
RSPerlDuncan Temple Lang
RSPythonDuncan Temple Lang
RSQLiteDavid A. James
SJavaJohn Chambers and Duncan Temple Lang
WriteXLSMarc Schwartz
XLConnectMirai Solutions GmbH
xlsReadWriteHans-Peter Suter
XMLDuncan Temple Lang
+
+ +

Brian Ripley is the author of the support for connections. +

+ +
+ +
+

+Next: , Previous: , Up: Top   [Contents][Index]

+
+ +

1 Introduction

+ +

Reading data into a statistical system for analysis and exporting the +results to some other system for report writing can be frustrating tasks +that can take far more time than the statistical analysis itself, even +though most readers will find the latter far more appealing. +

+

This manual describes the import and export facilities available either +in R itself or via packages which are available from CRAN +or elsewhere. +

+

Unless otherwise stated, everything described in this manual is (at +least in principle) available on all platforms running R. +

+

In general, statistical systems like R are not particularly well +suited to manipulations of large-scale data. Some other systems are +better than R at this, and part of the thrust of this manual is to +suggest that rather than duplicating functionality in R we can make +another system do the work! (For example Therneau & Grambsch (2000) +commented that they preferred to do data manipulation in SAS and then +use package survival in S for the analysis.) Database +manipulation systems are often very suitable for manipulating and +extracting data: several packages to interact with DBMSs are discussed +here. +

+

There are packages to allow functionality developed in languages such as +Java, perl and python to be directly integrated +with R code, making the use of facilities in these languages even +more appropriate. (See the rJava package from CRAN and +the SJava, RSPerl and RSPython packages from the +Omegahat project, http://www.omegahat.org.) +

+ + + + +

It is also worth remembering that R like S comes from the Unix +tradition of small re-usable tools, and it can be rewarding to use tools +such as awk and perl to manipulate data before import or +after export. The case study in Becker, Chambers & Wilks (1988, Chapter +9) is an example of this, where Unix tools were used to check and +manipulate the data before input to S. The traditional Unix tools +are now much more widely available, including for Windows. +

+ + + + + + +
+ +
+

+Next: , Previous: , Up: Introduction   [Contents][Index]

+
+ +

1.1 Imports

+ + + + + + +

The easiest form of data to import into R is a simple text file, and +this will often be acceptable for problems of small or medium scale. +The primary function to import from a text file is scan, and this +underlies most of the more convenient functions discussed in +Spreadsheet-like data. +

+

However, all statistical consultants are familiar with being presented +by a client with a memory stick (formerly, a floppy disc or CD-R) of +data in some proprietary binary format, for example ‘an Excel +spreadsheet’ or ‘an SPSS file’. Often the simplest thing to do is to +use the originating application to export the data as a text file (and +statistical consultants will have copies of the most common applications +on their computers for that purpose). However, this is not always +possible, and Importing from other statistical systems discusses +what facilities are available to access such files directly from R. +For Excel spreadsheets, the available methods are summarized in +Reading Excel spreadsheets. For ODS spreadsheets from Open +Office, see the Omegahat package1 ROpenOffice. +

+

In a few cases, data have been stored in a binary form for compactness +and speed of access. One application of this that we have seen several +times is imaging data, which is normally stored as a stream of bytes as +represented in memory, possibly preceded by a header. Such data formats +are discussed in Binary files and Binary connections. +

+

For much larger databases it is common to handle the data using a +database management system (DBMS). There is once again the option of +using the DBMS to extract a plain file, but for many such DBMSs the +extraction operation can be done directly from an R package: +See Relational databases. Importing data via network connections is +discussed in Network interfaces. +

+
+ +
+

+Previous: , Up: Imports   [Contents][Index]

+
+ +

1.1.1 Encodings

+ + +

Unless the file to be imported from is entirely in ASCII, it +is usually necessary to know how it was encoded. For text files, a good +way to find out something about its structure is the file +command-line tool (for Windows, included in Rtools). This +reports something like +

+
+
text.Rd: UTF-8 Unicode English text
+text2.dat: ISO-8859 English text
+text3.dat: Little-endian UTF-16 Unicode English character data,
+   with CRLF line terminators
+intro.dat: UTF-8 Unicode text
+intro.dat: UTF-8 Unicode (with BOM) text
+
+ +

Modern Unix-alike systems, including OS X, are likely to produce +UTF-8 files. Windows may produce what it calls ‘Unicode’ files +(UCS-2LE or just possibly UTF-16LE2). Otherwise most files will be in an +8-bit encoding unless from a Chinese/Japanese/Korean locale (which have +a wide range of encodings in common use). It is not possible to +automatically detect with certainty which 8-bit encoding is in use (although +guesses may be possible and file may guess as it did in the +example above), so you may simply have to ask the originator for some +clues (e.g. ‘Russian on Windows’). +

+

‘BOMs’ (Byte Order Marks, +https://en.wikipedia.org/wiki/Byte_order_mark) cause problems for +Unicode files. In the Unix world BOMs are rarely used, whereas in the +Windows world they almost always are for UCS-2/UTF-16 files, and often +are for UTF-8 files. The file utility will not even recognize +UCS-2 files without a BOM, but many other utilities will refuse to read +files with a BOM and the IANA standards for UTF-16LE +and UTF-16BE prohibit it. We have too often been reduced to +looking at the file with the command-line utility od or a hex +editor to work out its encoding. +

+

Note that utf8 is not a valid encoding name (UTF-8 is), +and macintosh is the most portable name for what is sometimes +called ‘Mac Roman’ encoding. +

+
+ +
+

+Next: , Previous: , Up: Introduction   [Contents][Index]

+
+ +

1.2 Export to text files

+ + +

Exporting results from R is usually a less contentious task, but +there are still a number of pitfalls. There will be a target +application in mind, and normally a text file will be the most +convenient interchange vehicle. (If a binary file is required, see +Binary files.) +

+ +

Function cat underlies the functions for exporting data. It +takes a file argument, and the append argument allows a +text file to be written via successive calls to cat. Better, +especially if this is to be done many times, is to open a file +connection for writing or appending, and cat to that connection, +then close it. +

+ + +

The most common task is to write a matrix or data frame to file as a +rectangular grid of numbers, possibly with row and column labels. This +can be done by the functions write.table and write. +Function write just writes out a matrix or vector in a specified +number of columns (and transposes a matrix). Function +write.table is more convenient, and writes out a data frame (or +an object that can be coerced to a data frame) with row and column +labels. +

+

There are a number of issues that need to be considered in writing out a +data frame to a text file. +

+
    +
  1. +Precision + +

    Most of the conversions of real/complex numbers done by these functions +is to full precision, but those by write are governed by the +current setting of options(digits). For more control, use +format on a data frame, possibly column-by-column. +

    +
  2. Header line + +

    R prefers the header line to have no entry for the row names, so the +file looks like +

    +
    +
                    dist    climb   time
    +Greenmantle     2.5     650     16.083
    +   ...
    +
    + +

    Some other systems require a (possibly empty) entry for the row names, which +is what write.table will provide if argument col.names = NA +is specified. Excel is one such system. +

    +
  3. Separator + + + + + +

    A common field separator to use in the file is a comma, as that is +unlikely to appear in any of the fields in English-speaking countries. +Such files are known as CSV (comma separated values) files, and wrapper +function write.csv provides appropriate defaults. In some +locales the comma is used as the decimal point (set this in +write.table by dec = ",") and there CSV files use the +semicolon as the field separator: use write.csv2 for appropriate +defaults. There is an IETF standard for CSV files (which mandates +commas and CRLF line endings, for which use eol = "\r\n"), RFC4180 +(see https://tools.ietf.org/html/rfc4180), but what is more +important in practice is that the file is readable by the application it +is targeted at. +

    +

    Using a semicolon or tab (sep = "\t") are probably the safest +options. +

    +
  4. Missing values + + +

    By default missing values are output as NA, but this may be +changed by argument na. Note that NaNs are treated as +NA by write.table, but not by cat nor write. +

    +
  5. Quoting strings + + +

    By default strings are quoted (including the row and column names). +Argument quote controls if character and factor variables are +quoted: some programs, for example Mondrian, do not accept quoted +strings (which are the default). +

    +

    Some care is needed if the strings contain embedded quotes. Three +useful forms are +

    +
    +
    > df <- data.frame(a = I("a \" quote"))
    +> write.table(df)
    +"a"
    +"1" "a \" quote"
    +> write.table(df, qmethod = "double")
    +"a"
    +"1" "a "" quote"
    +> write.table(df, quote = FALSE, sep = ",")
    +a
    +1,a " quote
    +
    + +

    The second is the form of escape commonly used by spreadsheets. +

    +
  6. Encodings + + +

    Text files do not contain metadata on their encodings, so for +non-ASCII data the file needs to be targetted to the +application intended to read it. All of these functions can write to a +connection which allows an encoding to be specified for the file, +and write.table has a fileEncoding argument to make this +easier. +

    +

    The hard part is to know what file encoding to use. For use on Windows, +it is best to use what Windows calls ‘Unicode’3, that is "UTF-16LE". Using UTF-8 is a good way +to make portable files that will not easily be confused with any other +encoding, but even OS X applications (where UTF-8 is the system +encoding) may not recognize them, and Windows applications are most +unlikely to. Apparently Excel:mac 2004/8 expects .csv files in +"macroman" encoding (the encoding used in much earlier versions +of Mac OS). +

    +
+ + +

Function write.matrix in package MASS provides a +specialized interface for writing matrices, with the option of writing +them in blocks and thereby reducing memory usage. +

+ +

It is possible to use sink to divert the standard R output to +a file, and thereby capture the output of (possibly implicit) +print statements. This is not usually the most efficient route, +and the options(width) setting may need to be increased. +
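A minimal sketch of this, writing a model summary to a (hypothetical) file:

sink("analysis-output.txt")
options(width = 120)                          # avoid unwanted line wrapping
print(summary(lm(dist ~ speed, data = cars)))
sink()                                        # output goes back to the console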

+ +

Function write.foreign in package foreign uses +write.table to produce a text file and also writes a code file +that will read this text file into another statistical package. There is +currently support for export to SAS, SPSS and Stata. +
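For example, the following sketch writes the built-in esoph data frame as a text file plus an SPSS syntax file that reads it (the file names are hypothetical):

library(foreign)
write.foreign(esoph, datafile = "esoph.txt", codefile = "esoph.sps",
              package = "SPSS")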

+
+ +
+


+
+ +

1.3 XML

+ + +

When reading data from text files, it is the responsibility of the user +to know and to specify the conventions used to create that file, +e.g. the comment character, whether a header line is present, the value +separator, the representation for missing values (and so on) described +in Export to text files. A markup language which can be used to +describe not only content but also the structure of the content can +make a file self-describing, so that one need not provide these details +to the software reading the data. +

+

The eXtensible Markup Language – more commonly known simply as +XML – can be used to provide such structure, not only for +standard datasets but also more complex data structures. +XML is becoming extremely popular and is emerging as a +standard for general data markup and exchange. It is being used by +different communities to describe geographical data such as maps, +graphical displays, mathematics and so on. +

+

XML provides a way to specify the file’s encoding, e.g. +

+
+
<?xml version="1.0" encoding="UTF-8"?>
+
+ +

although it does not require it. +

+

The XML package provides general facilities for reading and +writing XML documents within R. A description of the +facilities of the XML package is outside the scope of this +document: see the package’s Web page at +http://www.omegahat.org/RSXML for details and examples. Package +StatDataML on CRAN is one example building on +XML. +

+

NB: XML is available as a binary package for Windows, normally +from the ‘CRAN extras’ repository (which is selected by default on +Windows). +

+
+ + + +

2 Spreadsheet-like data

+ + + + + + + + + + + +

In Export to text files we saw a number of variations on the +format of a spreadsheet-like text file, in which the data are presented +in a rectangular grid, possibly with row and column labels. In this +section we consider importing such files into R. +

+
+ + + +

2.1 Variations on read.table

+ + +

The function read.table is the most convenient way to read in a +rectangular grid of data. Because of the many possibilities, there are +several other functions that call read.table but change a group +of default arguments. +

+

Beware that read.table is an inefficient way to read in +very large numerical matrices: see scan below. +

+

Some of the issues to consider are: +

+
    +
  1. Encoding + +

    If the file contains non-ASCII character fields, ensure that +it is read in the correct encoding. This is mainly an issue for reading +Latin-1 files in a UTF-8 locale, which can be done by something like +

    +
    +
    read.table("file.dat", fileEncoding="latin1")
    +
    + +

    Note that this will work in any locale which can represent Latin-1 +strings, but not many Greek/Russian/Chinese/Japanese … locales. +

    + +
  2. Header line + +

    We recommend that you specify the header argument explicitly. Conventionally the header line has entries only for the columns and not for the row labels, so it is one field shorter than the remaining lines. (If R sees this, it sets header = TRUE.) If presented with a file that has a (possibly empty) header field for the row labels, read it in by something like

    +
    +
    read.table("file.dat", header = TRUE, row.names = 1)
    +
    + +

    Column names can be given explicitly via the col.names; explicit +names override the header line (if present). +

    +
  3. Separator + +

    Normally looking at the file will determine the field separator to be +used, but with white-space separated files there may be a choice between +the default sep = "" which uses any white space (spaces, tabs or +newlines) as a separator, sep = " " and sep = "\t". Note +that the choice of separator affects the input of quoted strings. +

    +

    If you have a tab-delimited file containing empty fields be sure to use +sep = "\t". +

    + +
  4. Quoting + + +

    By default character strings can be quoted by either ‘"’ or +‘'’, and in each case all the characters up to a matching quote are +taken as part of the character string. The set of valid quoting +characters (which might be none) is controlled by the quote +argument. For sep = "\n" the default is changed to quote = +"". +

    +

    If no separator character is specified, quotes can be escaped within +quoted strings by immediately preceding them by ‘\’, C-style. +

    +

    If a separator character is specified, quotes can be escaped within +quoted strings by doubling them as is conventional in spreadsheets. For +example +

    +
    +
    'One string isn''t two',"one more"
    +
    + +

    can be read by +

    +
    +
    read.table("testfile", sep = ",")
    +
    + +

    This does not work with the default separator. +

    +
  5. Missing values + + +

    By default the file is assumed to contain the character string NA +to represent missing values, but this can be changed by the argument +na.strings, which is a vector of one or more character +representations of missing values. +

    +

    Empty fields in numeric columns are also regarded as missing values. +

    +

    In numeric columns, the values NaN, Inf and -Inf are +accepted. +

    +
  6. Unfilled lines + +

    It is quite common for a file exported from a spreadsheet to have all +trailing empty fields (and their separators) omitted. To read such +files set fill = TRUE. +

    +
  7. White space in character fields + +

    If a separator is specified, leading and trailing white space in +character fields is regarded as part of the field. To strip the space, +use argument strip.white = TRUE. +

    +
  8. Blank lines + +

    By default, read.table ignores empty lines. This can be changed +by setting blank.lines.skip = FALSE, which will only be useful in +conjunction with fill = TRUE, perhaps to use blank rows to +indicate missing cases in a regular layout. +

    +
  9. Classes for the variables + +

    Unless you take any special action, read.table reads all the +columns as character vectors and then tries to select a suitable class +for each variable in the data frame. It tries in turn logical, +integer, numeric and complex, moving on if any +entry is not missing and cannot be converted.4 +If all of these fail, the variable is converted to a factor. +

    +

    Arguments colClasses and as.is provide greater control. +Specifying as.is = TRUE suppresses conversion of character +vectors to factors (only). Using colClasses allows the desired +class to be set for each column in the input: it will be faster and use +less memory. +

    +

    Note that colClasses and as.is are specified per +column, not per variable, and so include the column of row names +(if any). +

    +
  10. Comments + +

    By default, read.table uses ‘#’ as a comment character, +and if this is encountered (except in quoted strings) the rest of the +line is ignored. Lines containing only white space and a comment are +treated as blank lines. +

    +

    If it is known that there will be no comments in the data file, it is +safer (and may be faster) to use comment.char = "". +

    +
  11. Escapes + +

    Many OSes have conventions for using backslash as an escape character in +text files, but Windows does not (and uses backslash in path names). +It is optional in R whether such conventions are applied to data files. +

    +

    Both read.table and scan have a logical argument allowEscapes. This is false by default, and backslashes are then only interpreted as (under circumstances described above) escaping quotes. If this is set to be true, C-style escapes are interpreted, namely the control characters \a, \b, \f, \n, \r, \t, \v and octal and hexadecimal representations like \040 and \0x2A. Any other escaped character is treated as itself, including backslash. Note that Unicode escapes such as \uxxxx are never interpreted.

    +
  12. Encoding + +

    This can be specified by the fileEncoding argument, for example +

    +
    +
    fileEncoding = "UCS-2LE"    # Windows ‘Unicode’ files
    +fileEncoding = "UTF-8"
    +
    + +

    If you know (correctly) the file’s encoding this will almost always +work. However, we know of one exception, UTF-8 files with a BOM. Some +people claim that UTF-8 files should never have a BOM, but some software +(apparently including Excel:mac) uses them, and many Unix-alike OSes do +not accept them. So faced with a file which file reports as +

    +
    +
    intro.dat: UTF-8 Unicode (with BOM) text
    +
    + +

    it can be read on Windows by +

    +
    +
    read.table("intro.dat", fileEncoding = "UTF-8")
    +
    + +

    but on a Unix-alike might need +

    +
    +
    read.table("intro.dat", fileEncoding = "UTF-8-BOM")
    +
    + +

    (This would most likely work without specifying an encoding in a UTF-8 locale.) +

    +

    Another problem with this (real-life) example is that whereas file-5.03 reported the BOM, file-4.17 found on OS X 10.5 (Leopard) did not.

+ + + + + + + + +

Convenience functions read.csv and read.delim provide +arguments to read.table appropriate for CSV and tab-delimited +files exported from spreadsheets in English-speaking locales. The +variations read.csv2 and read.delim2 are appropriate for +use in those locales where the comma is used for the decimal point and +(for read.csv2) for spreadsheets which use semicolons to separate +fields. +

+

If the options to read.table are specified incorrectly, the error +message will usually be of the form +

+
+
Error in scan(file = file, what = what, sep = sep, : 
+        line 1 did not have 5 elements
+
+ +

or +

+
+
Error in read.table("files.dat", header = TRUE) : 
+        more columns than column names
+
+ + + +

This may give enough information to find the problem, but the auxiliary +function count.fields can be useful to investigate further. +

+

Efficiency can be important when reading large data grids. It will help +to specify comment.char = "", colClasses as one of the +atomic vector types (logical, integer, numeric, complex, character or +perhaps raw) for each column, and to give nrows, the number of +rows to be read (and a mild over-estimate is better than not specifying +this at all). See the examples in later sections. +
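As a quick sketch of such a call (the file name, column count and row count are hypothetical):

DF <- read.table("big.dat", header = TRUE, comment.char = "",
                 colClasses = rep("numeric", 5), nrows = 1.1e6)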

+ +
+ + + +

2.2 Fixed-width-format files

+ + +

Sometimes data files have no field delimiters but have fields in +pre-specified columns. This was very common in the days of punched +cards, and is still sometimes used to save file space. +

+ +

Function read.fwf provides a simple way to read such files, +specifying a vector of field widths. The function reads the file into +memory as whole lines, splits the resulting character strings, writes +out a temporary tab-separated file and then calls read.table. +This is adequate for small files, but for anything more complicated we +recommend using the facilities of a language like perl to +pre-process the file. + +
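A minimal sketch, assuming a file whose three fields occupy 2, 10 and 4 characters respectively:

d <- read.fwf("fixed.dat", widths = c(2, 10, 4),
              col.names = c("id", "name", "score"))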

+ + +

Function read.fortran is a similar function for fixed-format files, +using Fortran-style column specifications. +

+
+ + + +

2.3 Data Interchange Format (DIF)

+ + +

An old format sometimes used for spreadsheet-like data is DIF, or Data Interchange Format.

+ +

Function read.DIF provides a simple way to read such files. It takes +arguments similar to read.table for assigning types to each of the columns. +

+

On Windows, spreadsheet programs often store spreadsheet data copied to +the clipboard in this format; read.DIF("clipboard") can read it +from there directly. It is slightly more robust than +read.table("clipboard") in handling spreadsheets with empty +cells. +

+
+ + + +

2.4 Using scan directly

+ + +

Both read.table and read.fwf use scan to read the +file, and then process the results of scan. They are very +convenient, but sometimes it is better to use scan directly. +

+

Function scan has many arguments, most of which we have already +covered under read.table. The most crucial argument is +what, which specifies a list of modes of variables to be read +from the file. If the list is named, the names are used for the +components of the returned list. Modes can be numeric, character or +complex, and are usually specified by an example, e.g. 0, +"" or 0i. For example +

+
+
cat("2 3 5 7", "11 13 17 19", file="ex.dat", sep="\n")
+scan(file="ex.dat", what=list(x=0, y="", z=0), flush=TRUE)
+
+ +

returns a list with three components and discards the fourth column in +the file. +

+ +

There is a function readLines which will be more convenient if +all you want is to read whole lines into R for further processing. +

+

One common use of scan is to read in a large matrix. Suppose +file matrix.dat just contains the numbers for a 200 x 2000 +matrix. Then we can use +

+
+
A <- matrix(scan("matrix.dat", n = 200*2000), 200, 2000, byrow = TRUE)
+
+ +

On one test this took 1 second (under Linux, 3 seconds under Windows on +the same machine) whereas +

+
+
A <- as.matrix(read.table("matrix.dat"))
+
+ +

took 10 seconds (and more memory), and +

+
+
A <- as.matrix(read.table("matrix.dat", header = FALSE, nrows = 200,
+                          comment.char = "", colClasses = "numeric"))
+
+ +

took 7 seconds. The difference is almost entirely due to the overhead +of reading 2000 separate short columns: were they of length 2000, +scan took 9 seconds whereas read.table took 18 if used +efficiently (in particular, specifying colClasses) and 125 if +used naively. +

+ +

Note that timings can depend on the type read and the data. +Consider reading a million distinct integers: +

+
writeLines(as.character((1+1e6):2e6), "ints.dat")
+xi <- scan("ints.dat", what=integer(0), n=1e6)   # 0.77s
+xn <- scan("ints.dat", what=numeric(0), n=1e6)   # 0.93s
+xc <- scan("ints.dat", what=character(0), n=1e6) # 0.85s
+xf <- as.factor(xc)                              # 2.2s
+DF <- read.table("ints.dat")                     # 4.5s
+
+

and a million examples of a small set of codes: +

+
code <- c("LMH", "SJC", "CHCH", "SPC", "SOM")
+writeLines(sample(code, 1e6, replace=TRUE), "code.dat")
+y <- scan("code.dat", what=character(0), n=1e6)  # 0.44s
+yf <- as.factor(y)                               # 0.21s
+DF <- read.table("code.dat")                     # 4.9s
+DF <- read.table("code.dat", nrows=1e6)          # 3.6s
+
+ +

Note that these timings depend heavily on the operating system (the basic reads in Windows take at least twice as long as these Linux times) and on the precise state of the garbage collector.

+ +
+ + + +

2.5 Re-shaping data

+ + +

Sometimes spreadsheet data is in a compact format that gives the +covariates for each subject followed by all the observations on that +subject. R’s modelling functions need observations in a single +column. Consider the following sample of data from repeated MRI brain +measurements +

+
+
 Status   Age    V1     V2     V3    V4
+      P 23646 45190  50333  55166 56271
+     CC 26174 35535  38227  37911 41184
+     CC 27723 25691  25712  26144 26398
+     CC 27193 30949  29693  29754 30772
+     CC 24370 50542  51966  54341 54273
+     CC 28359 58591  58803  59435 61292
+     CC 25136 45801  45389  47197 47126
+
+ +

There are two covariates and up to four measurements on each subject. +The data were exported from Excel as a file mr.csv. +

+ +

We can use stack to help manipulate these data to give a single +response. +

+
+
zz <- read.csv("mr.csv", strip.white = TRUE)
+zzz <- cbind(zz[gl(nrow(zz), 1, 4*nrow(zz)), 1:2], stack(zz[, 3:6]))
+
+ +

with result +

+
+
      Status   Age values ind
+X1         P 23646  45190  V1
+X2        CC 26174  35535  V1
+X3        CC 27723  25691  V1
+X4        CC 27193  30949  V1
+X5        CC 24370  50542  V1
+X6        CC 28359  58591  V1
+X7        CC 25136  45801  V1
+X11        P 23646  50333  V2
+...
+
+ + +

Function unstack goes in the opposite direction, and may be +useful for exporting data. +

+ +

Another way to do this is to use the function +reshape, by +

+
+
> reshape(zz, idvar="id",timevar="var",
+  varying=list(c("V1","V2","V3","V4")),direction="long")
+    Status   Age var    V1 id
+1.1      P 23646   1 45190  1
+2.1     CC 26174   1 35535  2
+3.1     CC 27723   1 25691  3
+4.1     CC 27193   1 30949  4
+5.1     CC 24370   1 50542  5
+6.1     CC 28359   1 58591  6
+7.1     CC 25136   1 45801  7
+1.2      P 23646   2 50333  1
+2.2     CC 26174   2 38227  2
+...
+
+ +

The reshape function has a more complicated syntax than +stack but can be used for data where the ‘long’ form has more +than the one column in this example. With direction="wide", +reshape can also perform the opposite transformation. +

+

Some people prefer the tools in packages reshape, +reshape2 and plyr. +

+
+ + + +

2.6 Flat contingency tables

+ + +

Displaying higher-dimensional contingency tables in array form typically +is rather inconvenient. In categorical data analysis, such information +is often represented in the form of bordered two-dimensional arrays with +leading rows and columns specifying the combination of factor levels +corresponding to the cell counts. These rows and columns are typically +“ragged” in the sense that labels are only displayed when they change, +with the obvious convention that rows are read from top to bottom and +columns are read from left to right. In R, such “flat” contingency +tables can be created using ftable, + +which creates objects of class "ftable" with an appropriate print +method. +

+

As a simple example, consider the R standard data set +UCBAdmissions which is a 3-dimensional contingency table +resulting from classifying applicants to graduate school at UC Berkeley +for the six largest departments in 1973 classified by admission and sex. +

+
+
> data(UCBAdmissions)
+> ftable(UCBAdmissions)
+                Dept   A   B   C   D   E   F
+Admit    Gender                             
+Admitted Male        512 353 120 138  53  22
+         Female       89  17 202 131  94  24
+Rejected Male        313 207 205 279 138 351
+         Female       19   8 391 244 299 317
+
+ +

The printed representation is clearly more useful than displaying the +data as a 3-dimensional array. +

+

There is also a function read.ftable for reading in flat-like +contingency tables from files. + +This has additional arguments for dealing with variants on how exactly +the information on row and column variables names and levels is +represented. The help page for read.ftable has some useful +examples. The flat tables can be converted to standard contingency +tables in array form using as.table. +

+

Note that flat tables are characterized by their “ragged” display of +row (and maybe also column) labels. If the full grid of levels of the +row variables is given, one should instead use read.table to read +in the data, and create the contingency table from this using +xtabs. +
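A sketch of that approach, assuming a (hypothetical) file with one row per combination of factor levels and a count column named Freq:

counts <- read.table("counts.dat", header = TRUE)
tab <- xtabs(Freq ~ ., data = counts)   # contingency table in array form
ftable(tab)                             # its flat display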

+ +
+ +
+


+
+ +

3 Importing from other statistical systems

+ + +

In this chapter we consider the problem of reading a binary data file +written by another statistical system. This is often best avoided, but +may be unavoidable if the originating system is not available. +

+

In all cases the facilities described were written for data files from +specific versions of the other system (often in the early 2000s), and +have not necessarily been updated for the most recent versions of the +other system. +

+ + + + + +
+ + + +

3.1 EpiInfo, Minitab, S-PLUS, SAS, SPSS, Stata, Systat

+ +

The recommended package foreign provides import facilities for +files produced by these statistical systems, and for export to Stata. In +some cases these functions may require substantially less memory than +read.table would. write.foreign (See Export to text files) provides an export mechanism with support currently for +SAS, SPSS and Stata. +

+ + + +

EpiInfo versions 5 and 6 stored data in a self-describing fixed-width +text format. read.epiinfo will read these .REC files into +an R data frame. EpiData also produces data in this format. +

+ + +

Function read.mtp imports a ‘Minitab Portable Worksheet’. This +returns the components of the worksheet as an R list. +

+ + +

Function read.xport reads a file in SAS Transport (XPORT) format and returns a list of data frames. If SAS is available on your system, function read.ssd can be used to create and run a SAS script that saves a SAS permanent dataset (.ssd or .sas7bdat) in Transport format. It then calls read.xport to read the resulting file. (Package Hmisc has a similar function sas.get, also running SAS.) For those without access to SAS but running on Windows, the SAS System Viewer (a zero-cost download) can be used to open SAS datasets and export them to e.g. .csv format.

+ + + + +

Function read.S can read binary objects produced by S-PLUS 3.x, 4.x or 2000 on (32-bit) Unix or Windows (and can read them on a different OS). It is able to read many but not all S objects: in particular it can read vectors, matrices and data frames, and lists containing those.

+

Function data.restore reads S-PLUS data dumps (created by +data.dump) with the same restrictions (except that dumps from the +Alpha platform can also be read). It should be possible to read data +dumps from S-PLUS 5.x and later written with data.dump(oldStyle=T). +

+

If you have access to S-PLUS, it is usually more reliable to dump +the object(s) in S-PLUS and source the dump file in R. For +S-PLUS 5.x and later you may need to use dump(..., oldStyle=T), +and to read in very large objects it may be preferable to use the dump +file as a batch script rather than use the source function. +

+ + + +

Function read.spss can read files created by the ‘save’ and +‘export’ commands in SPSS. It returns a list with one +component for each variable in the saved data set. SPSS +variables with value labels are optionally converted to R factors. +
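A minimal sketch (the file name is hypothetical); to.data.frame = TRUE asks for a data frame rather than a list:

library(foreign)
d <- read.spss("records.sav", to.data.frame = TRUE,
               use.value.labels = TRUE)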

+

SPSS Data Entry is an application for creating data entry +forms. By default it creates data files with extra formatting +information that read.spss cannot handle, but it is possible to +export the data in an ordinary SPSS format. +

+

Some third-party applications claim to produce data ‘in SPSS format’ but +with differences in the formats: read.spss may or may not be able +to handle these. +

+ + + +

Stata .dta files are a binary file format. Files from versions 5 +up to 11 of Stata can be read and written by functions read.dta +and write.dta. Stata variables with value labels are optionally +converted to (and from) R factors. Stata version 12 by default +writes ‘format-115 datasets’: read.dta currently may not be able +to read those. +
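For example (the file name is hypothetical):

library(foreign)
d <- read.dta("survey.dta")        # value labels become factors by default
write.dta(d, "survey-copy.dta")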

+ + + +

read.systat reads those Systat SAVE files that are +rectangular data files (mtype = 1) written on little-endian +machines (such as from Windows). These have extension .sys +or (more recently) .syd. +

+ +
+ + + +

3.2 Octave

+ + + +

Octave is a numerical linear algebra system +(http://www.octave.org), and function read.octave in +package foreign can read in files in Octave text data format +created using the Octave command save -ascii, with support for +most of the common types of variables, including the standard atomic +(real and complex scalars, matrices, and N-d arrays, strings, +ranges, and boolean scalars and matrices) and recursive (structs, cells, +and lists) ones. +

+
+ + + +

4 Relational databases

+ + + + + + + + + + +
+ + + +

4.1 Why use a database?

+ +

There are limitations on the types of data that R handles well. +Since all data being manipulated by R are resident in memory, and +several copies of the data can be created during execution of a +function, R is not well suited to extremely large data sets. Data +objects that are more than a (few) hundred megabytes in size can cause +R to run out of memory, particularly on a 32-bit operating system. +

+

R does not easily support concurrent access to data. That is, if +more than one user is accessing, and perhaps updating, the same data, +the changes made by one user will not be visible to the others. +

+

R does support persistence of data, in that you can save a data +object or an entire worksheet from one session and restore it at the +subsequent session, but the format of the stored data is specific to +R and not easily manipulated by other systems. +

+

Database management systems (DBMSs) and, in particular, relational +DBMSs (RDBMSs) are designed to do all of these things well. +Their strengths are +

+
    +
  1. To provide fast access to selected parts of large databases. + +
  2. Powerful ways to summarize and cross-tabulate columns in databases. + +
  3. Store data in more organized ways than the rectangular grid model of +spreadsheets and R data frames. + +
  4. Concurrent access from multiple clients running on multiple hosts while +enforcing security constraints on access to the data. + +
  5. Ability to act as a server to a wide range of clients. +
+ +

The sort of statistical applications for which DBMS might be used are to +extract a 10% sample of the data, to cross-tabulate data to produce a +multi-dimensional contingency table, and to extract data group by group +from a database for separate analysis. +

+

Increasingly OSes are themselves making use of DBMSs for these reasons, +so it is nowadays likely that one will be already installed on your +(non-Windows) OS. Akonadi +is used by KDE4 to store personal information. Several OS X +applications, including Mail and Address Book, use SQLite. +

+
+ + + +

4.2 Overview of RDBMSs

+ +

Traditionally there had been large (and expensive) commercial RDBMSs +(Informix; Oracle; Sybase; +IBM’s DB2; +Microsoft SQL +Server on Windows) and academic and small-system databases (such as +MySQL5, PostgreSQL, Microsoft +Access, …), the former marked out by much greater emphasis on data +security features. The line is blurring, with MySQL and PostgreSQL +having more and more high-end features, and free ‘express’ versions +being made available for the commercial DBMSs. +

+ + +

There are other commonly used data sources, including spreadsheets, +non-relational databases and even text files (possibly compressed). +Open Database Connectivity (ODBC) is a standard to use all of +these data sources. It originated on Windows (see +https://msdn.microsoft.com/en-us/library/ms710252%28v=vs.85%29.aspx) +but is also implemented on Linux/Unix/OS X. +

+

All of the packages described later in this chapter provide clients to +client/server databases. The database can reside on the same machine or +(more often) remotely. There is an ISO standard (in fact +several: SQL92 is ISO/IEC 9075, also known as +ANSI X3.135-1992, and SQL99 is coming into use) for +an interface language called SQL (Structured Query Language, +sometimes pronounced ‘sequel’: see Bowman et al. 1996 and Kline +and Kline 2001) which these DBMSs support to varying degrees. +

+ + + + + + +
+ + + +

4.2.1 SQL queries

+ + +

The more comprehensive R interfaces generate SQL behind the +scenes for common operations, but direct use of SQL is needed +for complex operations in all. Conventionally SQL is written +in upper case, but many users will find it more convenient to use lower +case in the R interface functions. +

+

A relational DBMS stores data as a database of tables (or +relations) which are rather similar to R data frames, in that +they are made up of columns or fields of one type +(numeric, character, date, currency, …) and rows or +records containing the observations for one entity. +

+

SQL ‘queries’ are quite general operations on a relational +database. The classical query is a SELECT statement of the type +

+
+
SELECT State, Murder FROM USArrests WHERE Rape > 30 ORDER BY Murder
+
+SELECT t.sch, c.meanses, t.sex, t.achieve
+  FROM student as t, school as c WHERE t.sch = c.id
+
+SELECT sex, COUNT(*) FROM student GROUP BY sex
+
+SELECT sch, AVG(sestat) FROM student GROUP BY sch LIMIT 10
+
+ +

The first of these selects two columns from the R data frame +USArrests that has been copied across to a database table, +subsets on a third column and asks the results be sorted. The second +performs a database join on two tables student and +school and returns four columns. The third and fourth queries do +some cross-tabulation and return counts or averages. (The five +aggregation functions are COUNT(*) and SUM, MAX, MIN and AVG, each +applied to a single column.) +

+

SELECT queries use FROM to select the table, WHERE to specify a +condition for inclusion (or more than one condition separated by AND or +OR), and ORDER BY to sort the result. Unlike data frames, rows in RDBMS +tables are best thought of as unordered, and without an ORDER BY +statement the ordering is indeterminate. You can sort (in +lexicographical order) on more than one column by separating them by +commas. Placing DESC after an ORDER BY puts the sort in descending +order. +

+

SELECT DISTINCT queries will only return one copy of each distinct row +in the selected table. +

+

The GROUP BY clause selects subgroups of the rows according to the +criterion. If more than one column is specified (separated by commas) +then multi-way cross-classifications can be summarized by one of the +five aggregation functions. A HAVING clause allows the select to +include or exclude groups depending on the aggregated value. +

+

If the SELECT statement contains an ORDER BY statement that produces a +unique ordering, a LIMIT clause can be added to select (by number) a +contiguous block of output rows. This can be useful to retrieve rows a +block at a time. (It may not be reliable unless the ordering is unique, +as the LIMIT clause can be used to optimize the query.) +

+

There are queries to create a table (CREATE TABLE, but usually one +copies a data frame to the database in these interfaces), INSERT or +DELETE or UPDATE data. A table is destroyed by a DROP TABLE ‘query’. +

+

Kline and Kline (2001) discuss the details of the implementation of SQL +in Microsoft SQL Server 2000, Oracle, MySQL and PostgreSQL. +

+
+ +
+


+
+ +

4.2.2 Data types

+ +

Data can be stored in a database in various data types. The range of +data types is DBMS-specific, but the SQL standard defines many +types, including the following that are widely implemented (often not by +the SQL name). +

+
+
float(p)
+

Real number, with optional precision. Often called real or +double or double precision. +

+
integer
+

32-bit integer. Often called int. +

+
smallint
+

16-bit integer +

+
character(n)
+

fixed-length character string. Often called char. +

+
character varying(n)
+

variable-length character string. Often called varchar. Almost +always has a limit of 255 chars. +

+
boolean
+

true or false. Sometimes called bool or bit. +

+
date
+

calendar date +

+
time
+

time of day +

+
timestamp
+

date and time +

+
+ +

There are variants on time and timestamp, with +timezone. Other types widely implemented are text and +blob, for large blocks of text and binary data, respectively. +

+

The more comprehensive of the R interface packages hide the type +conversion issues from the user. +

+
+ + + +

4.3 R interface packages

+ +

There are several packages available on CRAN to help R +communicate with DBMSs. They provide different levels of abstraction. +Some provide means to copy whole data frames to and from databases. All +have functions to select data within the database via SQL +queries, and to retrieve the result as a whole as a +data frame or in pieces (usually as groups of rows). +

+

All except RODBC are tied to one DBMS, but there has been a +proposal for a unified ‘front-end’ package DBI +(https://developer.r-project.org/db) in conjunction with a +‘back-end’, the most developed of which is RMySQL. Also on +CRAN are the back-ends ROracle, RPostgreSQL and +RSQLite (which works with the bundled DBMS SQLite, +https://www.sqlite.org), RJDBC (which uses Java and can +connect to any DBMS that has a JDBC driver) and RpgSQL (a +specialist interface to PostgreSQL built on top of RJDBC). +

+

The BioConductor project has updated RdbiPgSQL (formerly on +CRAN ca 2000), a first-generation interface to PostgreSQL. +

+

PL/R (http://www.joeconway.com/plr/) is a project to embed R into +PostgreSQL. +

+

Package RMongo provides an R interface to a Java client for +‘MongoDB’ (https://en.wikipedia.org/wiki/MongoDB) databases, which +are queried using JavaScript rather than SQL. Package rmongodb is +another client using mongodb’s C driver. +

+ + + + + + + +
+ + + +

4.3.1 Packages using DBI

+ + +

Package RMySQL on CRAN provides an interface to the +MySQL database system (see https://www.mysql.com and Dubois, +2000) or its fork MariaDB (see https://mariadb.org/). The +description here applies to versions 0.5-0 and later: earlier +versions had a substantially different interface. The current version +requires the DBI package, and this description will apply with +minor changes to all the other back-ends to DBI. +

+

MySQL exists on Unix/Linux/OS X and Windows: there is a ‘Community +Edition’ released under GPL but commercial licenses are also available. +MySQL was originally a ‘light and lean’ database. (It preserves the +case of names where the operating file system is case-sensitive, so not +on Windows.) +

+ + + + +

The call dbDriver("MySQL") returns a database connection manager +object, and then a call to dbConnect opens a database connection +which can subsequently be closed by a call to the generic function +dbDisconnect. Use dbDriver("Oracle"), +dbDriver("PostgreSQL") or dbDriver("SQLite") with those +DBMSs and packages ROracle, RPostgreSQL or RSQLite +respectively. +

+ + + +

SQL queries can be sent by either dbSendQuery or dbGetQuery. dbGetQuery sends the query and retrieves the results as a data frame. dbSendQuery sends the query and returns an object of class inheriting from "DBIResult" which can be used to retrieve the results, and subsequently used in a call to dbClearResult to remove the result.

+ +

Function fetch is used to retrieve some or all of the rows in the +query result, as a list. The function dbHasCompleted indicates if +all the rows have been fetched, and dbGetRowCount returns the +number of rows in the result. +
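For example, rows can be retrieved in blocks with something like the following sketch (it assumes a connection con and a table arrests as in the example below):

res <- dbSendQuery(con, "select * from arrests")
while (!dbHasCompleted(res)) {
    chunk <- fetch(res, n = 10)    # the next (up to) 10 rows
    ## ... process 'chunk' here ...
}
dbClearResult(res)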

+ + + + +

There are convenient interfaces to read, write, test and delete tables in the database: dbReadTable and dbWriteTable copy to and from an R data frame, mapping the row names of the data frame to the field row_names in the MySQL table.

+
+
> library(RMySQL) # will load DBI as well
+## open a connection to a MySQL database
+> con <- dbConnect(dbDriver("MySQL"), dbname = "test")
+## list the tables in the database
+> dbListTables(con)
+## load a data frame into the database, deleting any existing copy
+> data(USArrests)
+> dbWriteTable(con, "arrests", USArrests, overwrite = TRUE)
+TRUE
+> dbListTables(con)
+[1] "arrests"
+## get the whole table
+> dbReadTable(con, "arrests")
+               Murder Assault UrbanPop Rape
+Alabama          13.2     236       58 21.2
+Alaska           10.0     263       48 44.5
+Arizona           8.1     294       80 31.0
+Arkansas          8.8     190       50 19.5
+...
+## Select from the loaded table
+> dbGetQuery(con, paste("select row_names, Murder from arrests",
+                        "where Rape > 30 order by Murder"))
+   row_names Murder
+1   Colorado    7.9
+2    Arizona    8.1
+3 California    9.0
+4     Alaska   10.0
+5 New Mexico   11.4
+6   Michigan   12.1
+7     Nevada   12.2
+8    Florida   15.4
+> dbRemoveTable(con, "arrests")
+> dbDisconnect(con)
+
+ +
+ +
+


+
+ +

4.3.2 Package RODBC

+ + + +

Package RODBC on CRAN provides an interface to +database sources supporting an ODBC interface. This is very +widely available, and allows the same R code to access different +database systems. RODBC runs on Unix/Linux, Windows and OS X, +and almost all database systems provide support for ODBC. We +have tested Microsoft SQL Server, Access, MySQL, PostgreSQL, Oracle and +IBM DB2 on Windows and MySQL, MariaDB, Oracle, PostgreSQL and SQLite on +Linux. +

+

ODBC is a client-server system, and we have happily connected to a DBMS +running on a Unix server from a Windows client, and vice versa. +

+

On Windows ODBC support is part of the OS. On Unix/Linux you will need +an ODBC Driver Manager such as unixODBC +(http://www.unixODBC.org) or iOBDC (http://www.iODBC.org: +this is pre-installed in OS X) and an installed driver for your +database system. +

+ + + + +

Windows provides drivers not just for DBMSs but also for Excel +(.xls) spreadsheets, DBase (.dbf) files and even text +files. (The named applications do not need to be +installed. Which file formats are supported depends on the versions of +the drivers.) There are versions for Excel and Access 2007/2010 (go to +https://www.microsoft.com/en-us/download/default.aspx, and +search for ‘Office ODBC’, which will lead to +AccessDatabaseEngine.exe), the ‘2007 Office System Driver’ (the +latter has a version for 64-bit Windows, and that will also read earlier +versions). +

+

On OS X the Actual Technologies +(https://www.actualtech.com/product_access.php) drivers +provide ODBC interfaces to Access databases (including Access 2007/2010) +and to Excel spreadsheets (not including Excel 2007/2010). +

+ + + +

Many simultaneous connections are possible. A connection is opened by a +call to odbcConnect or odbcDriverConnect (which on the +Windows GUI allows a database to be selected via dialog boxes) which +returns a handle used for subsequent access to the database. Printing a +connection will provide some details of the ODBC connection, and calling +odbcGetInfo will give details on the client and server. +

+ + + +

A connection is closed by a call to close or odbcClose, and also (with a warning) when no R object refers to it and at the end of an R session.

+ +

Details of the tables on a connection can be found using +sqlTables. +

+ + +

Function sqlSave copies an R data frame to a table in the +database, and sqlFetch copies a table in the database to an R +data frame. +

+ + + + + +

An SQL query can be sent to the database by a call to +sqlQuery. This returns the result in an R data frame. +(sqlCopy sends a query to the database and saves the result as a +table in the database.) A finer level of control is attained by first +calling odbcQuery and then sqlGetResults to fetch the +results. The latter can be used within a loop to retrieve a limited +number of rows at a time, as can function sqlFetchMore. +

+ +

Here is an example using PostgreSQL, for which the ODBC driver +maps column and data frame names to lower case. We use a database +testdb we created earlier, and had the DSN (data source name) set +up in ~/.odbc.ini under unixODBC. Exactly the same code +worked using MyODBC to access a MySQL database under Linux or Windows +(where MySQL also maps names to lowercase). Under Windows, +DSNs are set up in the ODBC applet in the Control +Panel (‘Data Sources (ODBC)’ in the ‘Administrative Tools’ section). + +

+
+
> library(RODBC)
+## tell it to map names to l/case
+> channel <- odbcConnect("testdb", uid="ripley", case="tolower")
+## load a data frame into the database
+> data(USArrests)
+> sqlSave(channel, USArrests, rownames = "state", addPK = TRUE)
+> rm(USArrests)
+## list the tables in the database
+> sqlTables(channel)
+  TABLE_QUALIFIER TABLE_OWNER TABLE_NAME TABLE_TYPE REMARKS
+1                              usarrests      TABLE        
+## list it
+> sqlFetch(channel, "USArrests", rownames = "state")
+               murder assault urbanpop rape
+Alabama          13.2     236       58 21.2
+Alaska           10.0     263       48 44.5
+    ...
+## an SQL query, originally on one line
+> sqlQuery(channel, "select state, murder from USArrests
+           where rape > 30 order by murder")
+       state murder
+1 Colorado      7.9
+2 Arizona       8.1
+3 California    9.0
+4 Alaska       10.0
+5 New Mexico   11.4
+6 Michigan     12.1
+7 Nevada       12.2
+8 Florida      15.4
+## remove the table
+> sqlDrop(channel, "USArrests")
+## close the connection
+> odbcClose(channel)
+
+ + + + +

As a simple example of using ODBC under Windows with an Excel spreadsheet, we can read from a spreadsheet by

+
+
> library(RODBC)
+> channel <- odbcConnectExcel("bdr.xls")
+## list the spreadsheets
+> sqlTables(channel)
+  TABLE_CAT TABLE_SCHEM        TABLE_NAME   TABLE_TYPE REMARKS
+1 C:\\bdr            NA           Sheet1$ SYSTEM TABLE      NA
+2 C:\\bdr            NA           Sheet2$ SYSTEM TABLE      NA
+3 C:\\bdr            NA           Sheet3$ SYSTEM TABLE      NA
+4 C:\\bdr            NA Sheet1$Print_Area        TABLE      NA
+## retrieve the contents of sheet 1, by either of
+> sh1 <- sqlFetch(channel, "Sheet1")
+> sh1 <- sqlQuery(channel, "select * from [Sheet1$]")
+
+ +

Notice that the specification of the table is different from the name +returned by sqlTables: sqlFetch is able to map the +differences. +

+ + + + + + + + + + + + + + +
+ +
+


+
+ +

5 Binary files

+ + + + + + + +

Binary connections (Connections) are now the preferred way to +handle binary files. +

+ + +
+ +
+


+
+ +

5.1 Binary data formats

+ + + + + + +

Packages hdf5, h5r, Bioconductor’s rhdf5, +RNetCDF, ncdf and ncdf4 on CRAN provide +interfaces to NASA’s HDF5 (Hierarchical Data Format, see +https://www.hdfgroup.org/HDF5/) and to UCAR’s netCDF data files +(network Common Data Form, see +http://www.unidata.ucar.edu/software/netcdf/). +

+

Both of these are systems to store scientific data in array-oriented +ways, including descriptions, labels, formats, units, …. HDF5 also +allows groups of arrays, and the R interface maps lists +to HDF5 groups, and can write numeric and character vectors and +matrices. +

+

NetCDF’s version 4 format (confusingly, implemented in netCDF 4.1.1 and +later, but not in 4.0.1) includes the use of various HDF5 formats. This +is handled by package ncdf4 whereas RNetCDF and +ncdf handle version 3 files. +

+

The availability of software to support these formats is somewhat +limited by platform, especially on Windows. +
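A minimal sketch using ncdf4 (the file and variable names are hypothetical):

library(ncdf4)
nc <- nc_open("climate.nc")
temp <- ncvar_get(nc, "temp")   # read one variable as an array
nc_close(nc)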

+
+ +
+


+
+ +

5.2 dBase files (DBF)

+ + + +

dBase was a DOS program written by Ashton-Tate and later owned by +Borland which has a binary flat-file format that became popular, with +file extension .dbf. It has been adopted for the ’Xbase’ family +of databases, covering dBase, Clipper, FoxPro and their Windows +equivalents Visual dBase, Visual Objects and Visual FoxPro (see +http://www.e-bachmann.dk/docs/xbase.htm). A dBase file contains +a header and then a series of fields and so is most similar to an R +data frame. The data itself is stored in text format, and can include +character, logical and numeric fields, and other types in later versions +(see for example +http://www.digitalpreservation.gov/formats/fdd/fdd000325.shtml +and +http://www.clicketyclick.dk/databases/xbase/format/index.html). +

+ + +

Functions read.dbf and write.dbf provide ways to read and +write basic DBF files on all R platforms. For Windows users +odbcConnectDbase in package RODBC provides more +comprehensive facilities to read DBF files via Microsoft’s dBase +ODBC driver (and the Visual FoxPro driver can also be used via +odbcDriverConnect). + +
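For example (the file name is hypothetical):

library(foreign)
d <- read.dbf("parcels.dbf")
write.dbf(d, "parcels-copy.dbf")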

+
+ +
+


+
+ +

6 Image files

+ +

A particular class of binary files are those representing images, and a +not uncommon request is to read such a file into R as a matrix. +

+

There are many formats for image files (most with lots of variants), and +it may be necessary to use external conversion software to first convert +the image into one of the formats for which a package currently provides +an R reader. A versatile example of such software is ImageMagick and +its fork GraphicsMagick. These provide command-line programs +convert and gm convert to convert images from one +format to another: what formats they can input is determined when they +are compiled, and the supported formats can be listed by e.g. +convert -list format. +

+

Package pixmap has a function read.pnm to read ‘portable +anymap’ images in PBM (black/white), PGM (grey) and PPM (RGB colour) +formats. These are also known as ‘netpbm’ formats. +

+

Packages bmp, jpeg and png read the +formats after which they are named. See also packages biOps +and Momocs, and Bioconductor package EBImage. +
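For example, using package png (the file name is hypothetical):

library(png)
img <- readPNG("scan.png")   # an array, e.g. height x width x channels
dim(img)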

+

TIFF is more a meta-format, a wrapper within which a very large variety +of image formats can be embedded. Packages rtiff (orphaned) +and tiff can read some of the sub-formats (depending on the +external libtiff software against which they are compiled). +There some facilities for specialized sub-formats, for example in +Bioconductor package beadarray. +

+

Raster files are common in the geographical sciences, and package rgdal provides an interface to GDAL which provides some facilities of its own to read raster files and links to many others. Which formats it supports is determined when GDAL is compiled: use gdalDrivers() to see what these are for the build you are using. It can be useful for uncommon formats such as JPEG 2000 (which is a different format from JPEG, and not currently supported in either the OS X or Windows binary versions of rgdal).

+ +
+ +
+


+
+ +

7 Connections

+ + +

Connections are used in R in the sense of Chambers (1998) and +Ripley (2001), a set of functions to replace the use of file names by a +flexible interface to file-like objects. +

+ + + + + + + + + + +
+ +
+


+
+ +

7.1 Types of connections

+ + + + +

The most familiar type of connection will be a file, and file +connections are created by function file. File connections can +(if the OS will allow it for the particular file) be opened for reading +or writing or appending, in text or binary mode. In fact, files can be +opened for both reading and writing, and R keeps a separate file +position for reading and writing. +

+ + +

Note that by default a connection is not opened when it is created. The rule is that a function using a connection should open the connection if it is not already open, and close it after use if it opened it. In brief, leave the connection in the state you found it in. There are generic functions open and close with methods to explicitly open and close connections.

+ + + +

Files compressed via the algorithm used by gzip can be used as +connections created by the function gzfile, whereas files +compressed by bzip2 can be used via bzfile. +
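For example (file names hypothetical), a gzip-compressed text file can be read and written without uncompressing it first:

d <- read.table(gzfile("results.txt.gz"), header = TRUE)
con <- gzfile("results-copy.txt.gz", "w")
write.table(d, con, row.names = FALSE)
close(con)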

+ + + + +

Unix programmers are used to dealing with special files stdin, +stdout and stderr. These exist as terminal +connections in R. They may be normal files, but they might also +refer to input from and output to a GUI console. (Even with the standard +Unix R interface, stdin refers to the lines submitted from +readline rather than a file.) +

+

The three terminal connections are always open, and cannot be opened or closed. stdout and stderr are conventionally used for normal output and error messages respectively. They may normally go to the same place, but whereas normal output can be re-directed by a call to sink, error output is sent to stderr unless re-directed by sink(type = "message"). Note carefully the language used here: the connections cannot be re-directed, but output can be sent to other connections.

+ + +

Text connections are another source of input. They allow R +character vectors to be read as if the lines were being read from a text +file. A text connection is created and opened by a call to +textConnection, which copies the current contents of the +character vector to an internal buffer at the time of creation. +

+

Text connections can also be used to capture R output to a character +vector. textConnection can be asked to create a new character +object or append to an existing one, in both cases in the user’s +workspace. The connection is opened by the call to +textConnection, and at all times the complete lines output to the +connection are available in the R object. Closing the connection +writes any remaining output to a final element of the character vector. +

+ + +

Pipes are a special form of file that connects to another +process, and pipe connections are created by the function pipe. +Opening a pipe connection for writing (it makes no sense to append to a +pipe) runs an OS command, and connects its standard input to whatever +R then writes to that connection. Conversely, opening a pipe +connection for input runs an OS command and makes its standard output +available for R input from that connection. +

+ + +

URLs of types ‘http://’, ‘ftp://’ and ‘file://’ +can be read from using the function url. For convenience, +file will also accept these as the file specification and call +url. On most platforms ‘https://’ are also accepted. +
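For example, a (hypothetical) remote file can be read by either of

d <- read.csv(url("http://www.example.com/data/records.csv"))
d <- read.csv("http://www.example.com/data/records.csv")   # file() passes the URL to url()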

+ + +

Sockets can also be used as connections via function +socketConnection on platforms which support Berkeley-like sockets +(most Unix systems, Linux and Windows). Sockets can be written to or +read from, and both client and server sockets can be used. +

+ +
+ + + +

7.2 Output to connections

+ + + + + + +

We have described functions cat, write, write.table +and sink as writing to a file, possibly appending to a file if +argument append = TRUE, and this is what they did prior to R +version 1.2.0. +

+

The current behaviour is equivalent, but what actually happens is that +when the file argument is a character string, a file connection +is opened (for writing or appending) and closed again at the end of the +function call. If we want to repeatedly write to the same file, it is +more efficient to explicitly declare and open the connection, and pass +the connection object to each call to an output function. This also +makes it possible to write to pipes, which was implemented earlier in a +limited way via the syntax file = "|cmd" (which can still be +used). +

+ +

There is a function writeLines to write complete text lines +to a connection. +

+

Some simple examples are +

+
+
zz <- file("ex.data", "w")  # open an output file connection
+cat("TITLE extra line", "2 3 5 7", "", "11 13 17", 
+    file = zz, sep = "\n")
+cat("One more line\n", file = zz)
+close(zz)
+
+## convert decimal point to comma in output, using a pipe (Unix)
+## both R strings and (probably) the shell need \ doubled
+zz <- pipe(paste("sed s/\\\\./,/ >", "outfile"), "w")
+cat(format(round(rnorm(100), 4)), sep = "\n", file = zz)
+close(zz)
+## now look at the output file:
+file.show("outfile", delete.file = TRUE)
+
+## capture R output: use examples from help(lm)
+zz <- textConnection("ex.lm.out", "w")
+sink(zz)
+example(lm, prompt.echo = "> ")
+sink()
+close(zz)
+## now ‘ex.lm.out’ contains the output for further processing.
+## Look at it by, e.g.,
+cat(ex.lm.out, sep = "\n")
+
+ +
+ + + +

7.3 Input from connections

+ + + + +

The basic functions to read from connections are scan and +readLines. These take a character string argument and open a +file connection for the duration of the function call, but explicitly +opening a file connection allows a file to be read sequentially in +different formats. +

+

Other functions that call scan can also make use of connections, +in particular read.table. +

+

Some simple examples are +

+
+
## read in file created in last examples
+readLines("ex.data")
+unlink("ex.data")
+
+## read listing of current directory (Unix)
+readLines(pipe("ls -1"))
+
+# remove trailing commas from an input file.
+# Suppose we are given a file ‘data’ containing
+450, 390, 467, 654,  30, 542, 334, 432, 421,
+357, 497, 493, 550, 549, 467, 575, 578, 342,
+446, 547, 534, 495, 979, 479
+# Then read this by
+scan(pipe("sed -e s/,$// data"), sep=",")
+
+ + +

For convenience, if the file argument specifies an FTP or HTTP URL, the URL is opened for reading via url. Specifying files via ‘file://foo.bar’ is also allowed.

+ + + + +
+ + + +

7.3.1 Pushback

+ + + +

C programmers may be familiar with the ungetc function to push +back a character onto a text input stream. R connections have the +same idea in a more powerful way, in that an (essentially) arbitrary +number of lines of text can be pushed back onto a connection via a call +to pushBack. +

+

Pushbacks operate as a stack, so a read request first uses each line +from the most recently pushbacked text, then those from earlier +pushbacks and finally reads from the connection itself. Once a +pushbacked line is read completely, it is cleared. The number of +pending lines pushed back can be found via a call to +pushBackLength. + +

+

A simple example will show the idea. +

+
+
> zz <- textConnection(LETTERS)
+> readLines(zz, 2)
+[1] "A" "B"
+> scan(zz, "", 4)
+Read 4 items
+[1] "C" "D" "E" "F"
+> pushBack(c("aa", "bb"), zz)
+> scan(zz, "", 4)
+Read 4 items
+[1] "aa" "bb" "G"  "H" 
+> close(zz)
+
+ +

Pushback is only available for connections opened for input in text mode. +

+
+ + + +

7.4 Listing and manipulating connections

+ + + +

A summary of all the connections currently opened by the user can be found by showConnections(), and a summary of all connections, including closed and terminal connections, by showConnections(all = TRUE).
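For example, a minimal sketch:

zz <- file("ex.data", "w")
showConnections()             # connections opened by the user
showConnections(all = TRUE)   # also closed and terminal connections
close(zz)
unlink("ex.data")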

+ + +

The generic function seek can be used to read and (on some +connections) reset the current position for reading or writing. +Unfortunately it depends on OS facilities which may be unreliable +(e.g. with text files under Windows). Function isSeekable +reports if seek can change the position on the connection +given by its argument. +
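A brief sketch of these functions in use:

zz <- file("seek.dat", "w+b")        # open for both reading and writing
writeBin(1:10, zz)
isSeekable(zz)                       # TRUE for file connections
seek(zz)                             # report the current position (in bytes)
seek(zz, where = 0)                  # rewind; returns the previous position
readBin(zz, integer(), n = 2)        # re-read the first two integers
close(zz)
unlink("seek.dat")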

+ +

The function truncate can be used to truncate a file opened for +writing at its current position. It works only for file +connections, and is not implemented on all platforms. +

+ +
+ + + +

7.5 Binary connections

+ + + + +

Functions readBin and writeBin read to and write from +binary connections. A connection is opened in binary mode by appending +"b" to the mode specification, that is using mode "rb" for +reading, and mode "wb" or "ab" (where appropriate) for +writing. The functions have arguments +

+
+
readBin(con, what, n = 1, size = NA, endian = .Platform$endian) 
+writeBin(object, con, size = NA, endian = .Platform$endian) 
+
+ +

In each case con is a connection which will be opened if +necessary for the duration of the call, and if a character string is +given it is assumed to specify a file name. +

+

It is slightly simpler to describe writing, so we will do that first. +object should be an atomic vector object, that is a vector of +mode numeric, integer, logical, character, +complex or raw, without attributes. By default this is +written to the file as a stream of bytes exactly as it is represented in +memory. +

+

readBin reads a stream of bytes from the file and interprets them +as a vector of mode given by what. This can be either an object +of the appropriate mode (e.g. what=integer()) or a character +string describing the mode (one of the five given in the previous +paragraph or "double" or "int"). Argument n +specifies the maximum number of vector elements to read from the +connection: if fewer are available a shorter vector will be returned. +Argument signed allows 1-byte and 2-byte integers to be +read as signed (the default) or unsigned integers. +

+

The remaining two arguments are used to write or read data for +interchange with another program or another platform. By default binary +data is transferred directly from memory to the connection or vice +versa. This will not suffice if the data are to be transferred to a +machine with a different architecture, but between almost all R +platforms the only change needed is that of byte-order. Common PCs +(‘ix86’-based and ‘x86_64’-based machines), Compaq Alpha +and Vaxen are little-endian, whereas Sun Sparc, mc680x0 series, +IBM R6000, SGI and most others are big-endian. (Network +byte-order (as used by XDR, eXternal Data Representation) is +big-endian.) To transfer to or from other programs we may need to do +more, for example to read 16-bit integers or write single-precision real +numbers. This can be done using the size argument, which +(usually) allows sizes 1, 2, 4, 8 for integers and logicals, and sizes +4, 8 and perhaps 12 or 16 for reals. Transferring at different sizes +can lose precision, and should not be attempted for vectors containing +NA’s. +
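For instance, a sketch of transferring 16-bit big-endian integers (writing them out and reading them back in):

zz <- file("ints.bin", "wb")
writeBin(1:5, zz, size = 2, endian = "big")    # 2-byte big-endian integers
close(zz)

zz <- file("ints.bin", "rb")
readBin(zz, integer(), n = 5, size = 2, signed = TRUE, endian = "big")
close(zz)
unlink("ints.bin")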

+ + +

Character strings are read and written in C format, that is as a string +of bytes terminated by a zero byte. Functions readChar and +writeChar provide greater flexibility. +
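A small sketch of the character-string functions:

zz <- file("chars.bin", "wb")
writeChar("R connections", zz, eos = NULL)   # write the bytes, no terminating nul
close(zz)

zz <- file("chars.bin", "rb")
readChar(zz, nchars = 13)                    # read the same 13 characters back
close(zz)
unlink("chars.bin")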

+ + + + + +
+ + + +

7.5.1 Special values

+ +

Functions readBin and writeBin will pass missing and +special values, although this should not be attempted if a size change +is involved. +

+

The missing value for R logical and integer types is INT_MIN, +the smallest representable int defined in the C header +limits.h, normally corresponding to the bit pattern +0x80000000. +
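So, for example, integer NAs survive a binary round trip at the native size:

zz <- file("na.bin", "wb")
writeBin(c(1L, NA_integer_, 3L), zz)   # NA is stored as INT_MIN
close(zz)

zz <- file("na.bin", "rb")
readBin(zz, integer(), n = 3)          # the NA is recovered
close(zz)
unlink("na.bin")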

+

The representation of the special values for R numeric and complex +types is machine-dependent, and possibly also compiler-dependent. The +simplest way to make use of them is to link an external application +against the standalone Rmath library which exports double +constants NA_REAL, R_PosInf and R_NegInf, and +include the header Rmath.h which defines the macros ISNAN +and R_FINITE. +

+

If that is not possible, on all current platforms IEC 60559 (aka IEEE +754) arithmetic is used, so standard C facilities can be used to test +for or set Inf, -Inf and NaN values. On such +platforms NA is represented by the NaN value with low-word +0x7a2 (1954 in decimal). +

+

Character missing values are written as NA, and there is no provision to recognize character values as missing (as this can be done by re-assigning them once read).

+ +
+ +
+


+
+ +

8 Network interfaces

+ + + + + + +

Some limited facilities are available to exchange data at a lower level +across network connections. +

+
+ + + +

8.1 Reading from sockets

+ + +

Base R comes with some facilities to communicate via +BSD sockets on systems that support them (including the common +Linux, Unix and Windows ports of R). One potential problem with +using sockets is that these facilities are often blocked for security +reasons or to force the use of Web caches, so these functions may be +more useful on an intranet than externally. For new projects it +is suggested that socket connections are used instead. +
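A minimal sketch of a socket connection (this assumes some service is already listening on port 6011 of the local machine; the port number is arbitrary):

con <- socketConnection("localhost", port = 6011, open = "r+", blocking = TRUE)
writeLines("hello", con)      # send a line to the server
readLines(con, n = 1)         # read one line of the reply
close(con)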

+ + + + +

The earlier low-level interface is given by functions make.socket, +read.socket, write.socket and close.socket. +

+ +
+ + + +

8.2 Using download.file

+ +

Function download.file is provided to read a file from a +Web resource via FTP or HTTP and write it to a file. Often this can be +avoided, as functions such as read.table and scan can read +directly from a URL, either by explicitly using url to open a +connection, or implicitly using it by giving a URL as the file +argument. +
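For example (the URL is hypothetical):

download.file("http://www.example.com/data.csv", destfile = "data.csv")
dat <- read.csv("data.csv")

## often the download can be avoided by reading from the URL directly
dat2 <- read.csv("http://www.example.com/data.csv")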

+ +
+ +
+


+
+ +

9 Reading Excel spreadsheets

+ +

The most common R data import/export question seems to be ‘how do I read +an Excel spreadsheet’. This chapter collects together advice and +options given earlier. Note that most of the advice is for pre-Excel +2007 spreadsheets and not the later .xlsx format. +

+ + + + + +

The first piece of advice is to avoid doing so if possible! If you have +access to Excel, export the data you want from Excel in tab-delimited or +comma-separated form, and use read.delim or read.csv to +import it into R. (You may need to use read.delim2 or +read.csv2 in a locale that uses comma as the decimal point.) +Exporting a DIF file and reading it using read.DIF is another +possibility. +
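For example, assuming ‘sales.csv’ and ‘sales.txt’ are hypothetical files exported from Excel:

dat <- read.csv("sales.csv")      # comma-separated export
dat <- read.delim("sales.txt")    # tab-delimited export
## in locales where comma is the decimal point (and ';' the field separator)
dat <- read.csv2("sales.csv")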

+

If you do not have Excel, many other programs are able to read such +spreadsheets and export in a text format on both Windows and Unix, for +example Gnumeric (http://www.gnome.org/projects/gnumeric/) and +OpenOffice (https://www.openoffice.org). You can also +cut-and-paste between the display of a spreadsheet in such a program and +R: read.table will read from the R console or, under Windows, +from the clipboard (via file = "clipboard" or +readClipboard). The read.DIF function can also read from +the clipboard. +

+

Note that an Excel .xls file is not just a spreadsheet: such +files can contain many sheets, and the sheets can contain formulae, +macros and so on. Not all readers can read other than the first sheet, +and may be confused by other contents of the file. +

+ + +

Windows users (of 32-bit R) can use odbcConnectExcel in +package RODBC. This can select rows and columns from any of the +sheets in an Excel spreadsheet file (at least from Excel 97–2003, +depending on your ODBC drivers: by calling odbcConnect directly +versions back to Excel 3.0 can be read). The version +odbcConnectExcel2007 will read the Excel 2007 formats as well as +earlier ones (provided the drivers are installed, including with 64-bit +Windows R: see RODBC). OS X users can also use RODBC if +they have a suitable driver (e.g. that from Actual Technologies). +
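A sketch of its use (32-bit Windows R only, assuming RODBC and a suitable ODBC Excel driver are installed; ‘results.xls’ is a made-up file name):

library(RODBC)
ch <- odbcConnectExcel("results.xls")
sqlTables(ch)                     # list the sheets in the workbook
dat <- sqlFetch(ch, "Sheet1")     # read one sheet as a data frame
close(ch)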

+ +

Perl users have contributed a module +OLE::SpreadSheet::ParseExcel and a program xls2csv.pl to +convert Excel 95–2003 spreadsheets to CSV files. Package gdata +provides a basic wrapper in its read.xls function. With suitable +Perl modules installed this function can also read Excel 2007 +spreadsheets. +

+ +

32-bit Windows package xlsReadWrite from +http://www.swissr.org/ and CRAN has a function read.xls to +read .xls files (based on a third-party non-Open-Source Delphi +component). +

+ + +

Packages dataframes2xls and WriteXLS each contain a function to write one or more data frames to an .xls file, using Python and Perl respectively. Another version of write.xls is available in package xlsReadWrite.

+ + +

Two packages which can read and manipulate Excel 2007/10 spreadsheets but not earlier formats are xlsx (which requires Java) and the Omegahat package RExcelXML.

+ +

Package XLConnect can read, write and manipulate both Excel +97–2003 and Excel 2007/10 spreadsheets, requiring Java. +

+ +
+ + + +

Appendix A References

+ +

R. A. Becker, J. M. Chambers and A. R. Wilks (1988) +The New S Language. A Programming Environment for Data Analysis +and Graphics. Wadsworth & Brooks/Cole. +

+

J. Bowman, S. Emberson and M. Darnovsky (1996) The +Practical SQL Handbook. Using Structured Query Language. +Addison-Wesley. +

+

J. M. Chambers (1998) Programming with Data. A Guide to the S +Language. Springer-Verlag. +

+

P. Dubois (2000) MySQL. New Riders. +

+

M. Henning and S. Vinoski (1999) Advanced CORBA Programming +with C++. Addison-Wesley. +

+

K. Kline and D. Kline (2001) SQL in a Nutshell. O’Reilly. +

+

B. Momjian (2000) PostgreSQL: Introduction and Concepts. +Addison-Wesley. +Also available at http://momjian.us/main/writings/pgsql/aw_pgsql_book/. +

+

B. D. Ripley (2001) Connections. R News, 1/1, 16–7. https://www.r-project.org/doc/Rnews/Rnews_2001-1.pdf

+ +

T. M. Therneau and P. M. Grambsch (2000) Modeling Survival +Data. Extending the Cox Model. Springer-Verlag. +

+

E. J. Yarger, G. Reese and T. King (1999) MySQL & mSQL. +O’Reilly. +

+
+ +
+


+
+ +

Function and variable index

+ +
Index Entry  Section

.
.dbf: RODBC
.xls: RODBC
.xls: RODBC

B
bzfile: Types of connections

C
cat: Export to text files
cat: Output to connections
close: RODBC
close: Types of connections
close.socket: Reading from sockets
count.fields: Variations on read.table

D
data.restore: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
dataframes2xls: Reading Excel spreadsheets
dbClearResult: DBI
dbConnect: DBI
dbDisconnect: DBI
dbDriver: DBI
dbExistsTable: DBI
dbGetQuery: DBI
dbReadTable: DBI
dbRemoveTable: DBI
dbSendQuery: DBI
dbWriteTable: DBI

F
fetch: DBI
file: Types of connections
format: Export to text files
ftable: Flat contingency tables

G
gzfile: Types of connections

H
hdf5: Binary data formats

I
isSeekable: Listing and manipulating connections

M
make.socket: Reading from sockets

N
netCDF: Binary data formats

O
odbcClose: RODBC
odbcConnect: RODBC
odbcConnectDbase: dBase files (DBF)
odbcConnectExcel: RODBC
odbcConnectExcel: Reading Excel spreadsheets
odbcConnectExcel2007: Reading Excel spreadsheets
odbcDriverConnect: RODBC
odbcGetInfo: RODBC
odbcQuery: RODBC
open: Types of connections

P
pipe: Types of connections
pushBack: Pushback
pushBackLength: Pushback

R
read.csv: Variations on read.table
read.csv: Reading Excel spreadsheets
read.csv2: Variations on read.table
read.dbf: dBase files (DBF)
read.delim: Variations on read.table
read.delim: Reading Excel spreadsheets
read.delim2: Variations on read.table
read.DIF: Data Interchange Format (DIF)
read.DIF: Reading Excel spreadsheets
read.dta: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
read.epiinfo: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
read.fortran: Fixed-width-format files
read.ftable: Flat contingency tables
read.fwf: Fixed-width-format files
read.mtp: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
read.octave: Octave
read.S: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
read.socket: Reading from sockets
read.spss: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
read.systat: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
read.table: Variations on read.table
read.table: Input from connections
read.table: Reading Excel spreadsheets
read.xls: Reading Excel spreadsheets
read.xport: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
readBin: Binary connections
readChar: Binary connections
readClipboard: Reading Excel spreadsheets
readLines: Using scan directly
readLines: Input from connections
reshape: Re-shaping data
RExcelXML: Reading Excel spreadsheets

S
scan: Imports
scan: Using scan directly
scan: Input from connections
seek: Listing and manipulating connections
showConnections: Listing and manipulating connections
sink: Export to text files
sink: Output to connections
socketConnection: Types of connections
sqlCopy: RODBC
sqlFetch: RODBC
sqlFetchMore: RODBC
sqlGetResults: RODBC
sqlQuery: RODBC
sqlSave: RODBC
sqlTables: RODBC
stack: Re-shaping data
stderr: Types of connections
stdin: Types of connections
stdout: Types of connections
Sys.localeconv: Variations on read.table

T
textConnection: Types of connections
truncate: Listing and manipulating connections

U
unstack: Re-shaping data
url: Types of connections

W
write: Export to text files
write: Output to connections
write.csv: Export to text files
write.csv2: Export to text files
write.dbf: dBase files (DBF)
write.dta: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
write.foreign: Export to text files
write.matrix: Export to text files
write.socket: Reading from sockets
write.table: Export to text files
write.table: Output to connections
writeBin: Binary connections
writeChar: Binary connections
writeLines: Output to connections
WriteXLS: Reading Excel spreadsheets

X
XLConnect: Reading Excel spreadsheets
xlsReadWrite: Reading Excel spreadsheets
xlsx: Reading Excel spreadsheets

+
+ +
+ +
+


+
+ +

Concept index

+ +
Index Entry  Section

A
AWK: Introduction

B
Binary files: Binary files
Binary files: Binary connections

C
comma separated values: Export to text files
Compressed files: Types of connections
Connections: Connections
Connections: Types of connections
Connections: Output to connections
Connections: Listing and manipulating connections
CSV files: Export to text files
CSV files: Variations on read.table

D
Data Interchange Format (DIF): Data Interchange Format (DIF)
Dbase: RODBC
dBase: dBase files (DBF)
DBF files: dBase files (DBF)
DBMS: Relational databases

E
Encodings: Encodings
Encodings: Export to text files
EpiData: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
EpiInfo: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
Excel: RODBC
Excel: RODBC
Exporting to a text file: Export to text files

F
File connections: Types of connections
Fixed-width-format files: Fixed-width-format files
Flat contingency tables: Flat contingency tables

H
Hierarchical Data Format: Binary data formats

I
Importing from other statistical systems: Importing from other statistical systems

L
locales: Variations on read.table

M
Minitab: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
Missing values: Export to text files
Missing values: Variations on read.table
MySQL database system: DBI
MySQL database system: RODBC

N
network Common Data Form: Binary data formats

O
Octave: Octave
ODBC: Overview of RDBMSs
ODBC: RODBC
Open Database Connectivity: Overview of RDBMSs
Open Database Connectivity: RODBC

P
perl: Introduction
perl: Fixed-width-format files
Pipe connections: Types of connections
PostgreSQL database system: RODBC
Pushback on a connection: Pushback

Q
Quoting strings: Export to text files
Quoting strings: Variations on read.table

R
Re-shaping data: Re-shaping data
Relational databases: Relational databases

S
S-PLUS: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
SAS: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
Sockets: Types of connections
Sockets: Reading from sockets
Spreadsheet-like data: Spreadsheet-like data
SPSS: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
SPSS Data Entry: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
SQL queries: SQL queries
Stata: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat
Systat: EpiInfo Minitab SAS S-PLUS SPSS Stata Systat

T
Terminal connections: Types of connections
Text connections: Types of connections

U
Unix tools: Introduction
URL connections: Types of connections
URL connections: Input from connections

X
XML: XML

+
+
+

Footnotes

+ +

(1)

+

Currently not available from +that repository but as a source package for download from +http://www.omegahat.org/ROpenOffice/.

+

(2)

+

the +distinction is subtle, +https://en.wikipedia.org/wiki/UTF-16/UCS-2, and the use of +surrogate pairs is very rare.

+

(3)

+

Even then, +Windows applications may expect a Byte Order Mark which the +implementation of iconv used by R may or may not add depending +on the platform.

+

(4)

+

This is normally +fast as looking at the first entry rules out most of the possibilities.

+

(5)

+

and forks, notably MariaDB.

+
+
+ + + + + diff --git a/R-exts.html b/R-exts.html new file mode 100644 index 0000000..1ea4b62 --- /dev/null +++ b/R-exts.html @@ -0,0 +1,15950 @@ + + + + + +Writing R Extensions + + + + + + + + + + + + + + + + +

Writing R Extensions

+ + + + + + + + + + + + + + + + + + + + + + + +

Table of Contents

+ +
+ + +
+ + + +
+


+
+ +

Writing R Extensions

+ +

This is a guide to extending R, describing the process of creating +R add-on packages, writing R documentation, R’s system and +foreign language interfaces, and the R API. +

+

This manual is for R, version 3.2.3 (2015-12-10). +

+

Copyright © 1999–2015 R Core Team +

+
+

Permission is granted to make and distribute verbatim copies of this +manual provided the copyright notice and this permission notice are +preserved on all copies. +

+

Permission is granted to copy and distribute modified versions of this +manual under the conditions for verbatim copying, provided that the +entire resulting derived work is distributed under the terms of a +permission notice identical to this one. +

+

Permission is granted to copy and distribute translations of this manual +into another language, under the above conditions for modified versions, +except that this permission notice may be stated in a translation +approved by the R Core Team. +

+ + + + + + + + + + + + + + + + + +
+ +
+


+
+ +

Acknowledgements

+ + +

The contributions to early versions of this manual by Saikat DebRoy +(who wrote the first draft of a guide to using .Call and +.External) and Adrian Trapletti (who provided information on the +C++ interface) are gratefully acknowledged. +

+
+ + + +

1 Creating R packages

+ + + +

Packages provide a mechanism for loading optional code, data and +documentation as needed. The R distribution itself includes about 30 +packages. +

+

In the following, we assume that you know the library() command, +including its lib.loc argument, and we also assume basic +knowledge of the R CMD INSTALL utility. Otherwise, please +look at R’s help pages on +

+
+
?library
+?INSTALL
+
+ +

before reading on. +

+

For packages which contain code to be compiled, a computing environment +including a number of tools is assumed; the “R Installation and +Administration” manual describes what is needed for each OS. +

+

Once a source package is created, it must be installed by +the command R CMD INSTALL. +See Add-on-packages in R Installation and Administration. +

+

Other types of extensions are supported (but rare): See Package types. +

+

Some notes on terminology complete this introduction. These will help +with the reading of this manual, and also in describing concepts +accurately when asking for help. +

+

A package is a directory of files which extend R, a +source package (the master files of a package), or a tarball +containing the files of a source package, or an installed +package, the result of running R CMD INSTALL on a source +package. On some platforms (notably OS X and Windows) there are also +binary packages, a zip file or tarball containing the files of an +installed package which can be unpacked rather than installing from +sources. +

+

A package is not1 a +library. The latter is used in two senses in R documentation. +

+
    +
  • A directory into which packages are installed, e.g. +/usr/lib/R/library: in that sense it is sometimes referred to as +a library directory or library tree (since the library is +a directory which contains packages as directories, which themselves +contain directories). + +
  • That used by the operating system, as a shared, dynamic or static +library or (especially on Windows) a DLL, where the second L stands for +‘library’. Installed packages may contain compiled code in what is +known on Unix-alikes as a shared object and on Windows as a DLL. +The concept of a shared library (dynamic library on OS X) +as a collection of compiled code to which a package might link is also +used, especially for R itself on some platforms. On most platforms +these concepts are interchangeable (shared objects and DLLs can both be +loaded into the R process and be linked against), but OS X +distinguishes between shared objects (extension .so) and dynamic +libraries (extension .dylib). + +
+ +

There are a number of well-defined operations on source packages. +

+
    +
  • The most common is installation which takes a source package and +installs it in a library using R CMD INSTALL or +install.packages. + +
  • Source packages can be built. This involves taking a source +directory and creating a tarball ready for distribution, including +cleaning it up and creating PDF documentation from any vignettes +it may contain. Source packages (and most often tarballs) can be +checked, when a test installation is done and tested (including +running its examples); also, the contents of the package are tested in +various ways for consistency and portability. + +
  • Compilation is not a correct term for a package. Installing a +source package which contains C, C++ or Fortran code will involve +compiling that code. There is also the possibility of ‘byte’ compiling +the R code in a package (using the facilities of package +compiler): already base and recommended packages are normally +byte-compiled and this can be specified for other packages. So +compiling a package may come to mean byte-compiling its R +code. + +
  • It used to be unambiguous to talk about loading an installed +package using library(), but since the advent of package +namespaces this has been less clear: people now often talk about +loading the package’s namespace and then attaching the +package so it becomes visible on the search path. Function +library performs both steps, but a package’s namespace can be +loaded without the package being attached (for example by calls like +splines::ns). + +
+ +

The concept of lazy loading of code or data is mentioned at +several points. This is part of the installation, always selected for +R code but optional for data. When used the R objects of the +package are created at installation time and stored in a database in the +R directory of the installed package, being loaded into the +session at first use. This makes the R session start up faster and +use less (virtual) memory. +(For technical details, +see Lazy loading in R Internals.) +

+ +

CRAN is a network of WWW sites holding the R distributions +and contributed code, especially R packages. Users of R are +encouraged to join in the collaborative project and to submit their own +packages to CRAN: current instructions are linked from +https://CRAN.R-project.org/banner.shtml#submitting. +

+ + + + + + + + + + + + + + + +
+ + + +

1.1 Package structure

+ + +

The sources of an R package consist of a subdirectory containing the files DESCRIPTION and NAMESPACE, and the subdirectories R, data, demo, exec, inst, man, po, src, tests, tools and vignettes (some of which can be missing, but which should not be empty). The package subdirectory may also contain files INDEX, configure, cleanup, LICENSE, LICENCE and NEWS. Other files such as INSTALL (for non-standard installation instructions), README/README.md2, or ChangeLog will be ignored by R, but may be useful to end users. The utility R CMD build may add files in a build directory (but this should not be used for other purposes).

+

Except where specifically mentioned,3 packages should not contain +Unix-style ‘hidden’ files/directories (that is, those whose name starts +with a dot). +

+

The DESCRIPTION and INDEX files are described in the +subsections below. The NAMESPACE file is described in the +section on Package namespaces. +

+ + + +

The optional files configure and cleanup are (Bourne +shell) script files which are, respectively, executed before and +(provided that option --clean was given) after installation on +Unix-alikes, see Configure and cleanup. The analogues on Windows +are configure.win and cleanup.win. +

+

For the conventions for files NEWS and ChangeLog in the +GNU project see +https://www.gnu.org/prep/standards/standards.html#Documentation. +

+

The package subdirectory should be given the same name as the package. +Because some file systems (e.g., those on Windows and by default on OS +X) are not case-sensitive, to maintain portability it is strongly +recommended that case distinctions not be used to distinguish different +packages. For example, if you have a package named foo, do not +also create a package named Foo. +

+

To ensure that file names are valid across file systems and supported operating systems, the ASCII control characters as well as the characters ‘"’, ‘*’, ‘:’, ‘/’, ‘<’, ‘>’, ‘?’, ‘\’, and ‘|’ are not allowed in file names. In addition, files with names ‘con’, ‘prn’, ‘aux’, ‘clock$’, ‘nul’, ‘com1’ to ‘com9’, and ‘lpt1’ to ‘lpt9’ after conversion to lower case and stripping possible “extensions” (e.g., ‘lpt5.foo.bar’), are disallowed. Also, file names in the same directory must not differ only by case (see the previous paragraph). In addition, the basenames of ‘.Rd’ files may be used in URLs and so must be ASCII and not contain %. For maximal portability filenames should contain only ASCII characters not excluded already (that is A-Za-z0-9._!#$%&+,;=@^(){}'[] — we exclude space as many utilities do not accept spaces in file paths): non-English alphabetic characters cannot be guaranteed to be supported in all locales. It would be good practice to avoid the shell metacharacters (){}'[]$~: ~ is also used as part of ‘8.3’ filenames on Windows. In addition, packages are normally distributed as tarballs, and these have a limit on path lengths: for maximal portability 100 bytes.

+

A source package if possible should not contain binary executable files: +they are not portable, and a security risk if they are of the +appropriate architecture. R CMD check will warn about +them4 unless they are listed (one filepath per line) in a file +BinaryFiles at the top level of the package. Note that +CRAN will not accept submissions containing binary files +even if they are listed. +

+

The R function package.skeleton can help to create the +structure for a new package: see its help page for details. +
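For example, a minimal sketch (the package name ‘mypkg’ is arbitrary):

f <- function(x, y) x + y
d <- data.frame(a = 1:3, b = letters[1:3])
package.skeleton(name = "mypkg", list = c("f", "d"))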

+ + + + + + + + + + + +
+ +
+


+
+ +

1.1.1 The DESCRIPTION file

+ + +

The DESCRIPTION file contains basic information about the package +in the following format: +

+
+
+
+
Package: pkgname
+Version: 0.5-1
+Date: 2015-01-01
+Title: My First Collection of Functions
+Authors@R: c(person("Joe", "Developer", role = c("aut", "cre"),
+		     email = "Joe.Developer@some.domain.net"),
+	      person("Pat", "Developer", role = "aut"),
+	      person("A.", "User", role = "ctb",
+		     email = "A.User@whereever.net"))
+Author: Joe Developer [aut, cre],
+  Pat Developer [aut],
+  A. User [ctb]
+Maintainer: Joe Developer <Joe.Developer@some.domain.net>
+Depends: R (>= 3.1.0), nlme
+Suggests: MASS
+Description: A (one paragraph) description of what
+  the package does and why it may be useful.
+License: GPL (>= 2)
+URL: https://www.r-project.org, http://www.another.url
+BugReports: https://pkgname.bugtracker.url
+
+
+
+ +

The format is that of a version of a ‘Debian Control File’ (see the help +for ‘read.dcf’ and +https://www.debian.org/doc/debian-policy/ch-controlfields.html: +R does not require encoding in UTF-8 and does not support comments +starting with ‘#’). Fields start with an ASCII name +immediately followed by a colon: the value starts after the colon and a +space. Continuation lines (for example, for descriptions longer than +one line) start with a space or tab. Field names are case-sensitive: +all those used by R are capitalized. +

+

For maximal portability, the DESCRIPTION file should be written +entirely in ASCII — if this is not possible it must contain +an ‘Encoding’ field (see below). +

+

Several optional fields take logical values: these can be +specified as ‘yes’, ‘true’, ‘no’ or ‘false’: +capitalized values are also accepted. +

+

The ‘Package’, ‘Version’, ‘License’, ‘Description’, +‘Title’, ‘Author’, and ‘Maintainer’ fields are mandatory, +all other fields are optional. Fields ‘Author’ and +‘Maintainer’ can be auto-generated from ‘Authors@R’, and may +be omitted if the latter is provided: however if they are not +ASCII we recommend that they are provided. +

+

The mandatory ‘Package’ field gives the name of the package. This +should contain only (ASCII) letters, numbers and dot, have at +least two characters and start with a letter and not end in a dot. If +it needs explaining, this should be done in the ‘Description’ field +(and not the ‘Title’ field). +

+

The mandatory ‘Version’ field gives the version of the package. +This is a sequence of at least two (and usually three) +non-negative integers separated by single ‘.’ or ‘-’ +characters. The canonical form is as shown in the example, and a +version such as ‘0.01’ or ‘0.01.0’ will be handled as if it +were ‘0.1-0’. It is not a decimal number, so for example +0.9 < 0.75 since 9 < 75. +
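This can be checked from within R, for example:

package_version("0.9") < package_version("0.75")   # TRUE: components compare as 9 < 75
compareVersion("0.9", "0.75")                      # -1, i.e. "0.9" is the earlier version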

+

The mandatory ‘License’ field is discussed in the next subsection. +

+

The mandatory ‘Title’ field should give a short description +of the package. Some package listings may truncate the title to 65 +characters. It should use title case (that is, use capitals for +the principal words: tools::toTitleCase can help you with this), +not use any markup, not have any continuation lines, and not end in a +period (unless part of …). Do not repeat the package name: it is +often used prefixed by the name. Refer to other packages and external +software in single quotes, and to book titles (and similar) in double +quotes. +

+

The mandatory ‘Description’ field should give a +comprehensive description of what the package does. One can use +several (complete) sentences, but only one paragraph. It should be +intelligible to all the intended readership (e.g. for a CRAN +package to all CRAN users). It is good practice not to start +with the package name, ‘This package’ or similar. As with the +‘Title’ field, double quotes should be used for quotations +(including titles of books and articles), and single quotes for +non-English usage, including names of other packages and external +software. This field should also be used for explaining the package +name if necessary. URLs should be enclosed in angle brackets, e.g. +‘<https://www.r-project.org>’: see also Specifying URLs. +

+

The mandatory ‘Author’ field describes who wrote the +package. It is a plain text field intended for human readers, but not +for automatic processing (such as extracting the email addresses of all +listed contributors: for that use ‘Authors@R’). Note that all +significant contributors must be included: if you wrote an R wrapper +for the work of others included in the src directory, you are not +the sole (and maybe not even the main) author. +

+

The mandatory ‘Maintainer’ field should give a single name +followed by a valid (RFC 2822) email address in angle brackets. It +should not end in a period or comma. This field is what is reported by +the maintainer function and used by bug.report. For a +CRAN package it should be a person, not a mailing list +and not a corporate entity: do ensure that it is valid and will remain +valid for the lifetime of the package. +

+

Note that the display name (the part before the address in angle +brackets) should be enclosed in double quotes if it contains +non-alphanumeric characters such as comma or period. (The current +standard, RFC 5322, allows periods but RFC 2822 did not.) +

+

Both ‘Author’ and ‘Maintainer’ fields can be omitted if a suitable ‘Authors@R’ field is given. This field can be used to provide a refined and machine-readable description of the package “authors” (in particular specifying their precise roles), via suitable R code. It should create an object of class "person", by either a call to person or a series of calls (one per “author”) concatenated by c(): see the example DESCRIPTION file above. The roles can include ‘"aut"’ (author) for full authors, ‘"cre"’ (creator) for the package maintainer, and ‘"ctb"’ (contributor) for other contributors, ‘"cph"’ (copyright holder), among others. See ?person for more information. Note that no role is assumed by default. Auto-generated package citation information takes advantage of this specification. The ‘Author’ and ‘Maintainer’ fields are auto-generated from it if needed when building5 or installing.

+ +

An optional ‘Copyright’ field can be used where the copyright +holder(s) are not the authors. If necessary, this can refer to an +installed file: the convention is to use file inst/COPYRIGHTS. +

+

The optional ‘Date’ field gives the release date of the +current version of the package. It is strongly recommended6 to use the ‘yyyy-mm-dd’ format conforming to the ISO +8601 standard. +

+

The ‘Depends’, ‘Imports’, ‘Suggests’, ‘Enhances’, +‘LinkingTo’ and ‘Additional_repositories’ fields are discussed +in a later subsection. +

+

Dependencies external to the R system should be listed in the +‘SystemRequirements’ field, possibly amplified in a separate +README file. +

+

The ‘URL’ field may give a list of URLs +separated by commas or whitespace, for example the homepage of the +author or a page where additional material describing the software can +be found. These URLs are converted to active hyperlinks in +CRAN package listings. See Specifying URLs. +

+

The ‘BugReports’ field may contain a single +URL to which bug reports about the package should be +submitted. This URL will be used by bug.report +instead of sending an email to the maintainer. +

+

Base and recommended packages (i.e., packages contained in the R +source distribution or available from CRAN and recommended to +be included in every binary distribution of R) have a ‘Priority’ +field with value ‘base’ or ‘recommended’, respectively. These +priorities must not be used by other packages. +

+

A ‘Collate’ field can be used for controlling the collation order +for the R code files in a package when these are processed for +package installation. The default is to collate according to the +‘C’ locale. If present, the collate specification must list +all R code files in the package (taking possible OS-specific +subdirectories into account, see Package subdirectories) as a +whitespace separated list of file paths relative to the R +subdirectory. +Paths containing white space or quotes need to be quoted. An +OS-specific collation field (‘Collate.unix’ or +‘Collate.windows’) will be used in preference to ‘Collate’. +

+

The ‘LazyData’ logical field controls whether the R datasets use +lazy-loading. A ‘LazyLoad’ field was used in versions prior to +2.14.0, but now is ignored. +

+

The ‘KeepSource’ logical field controls if the package code is sourced +using keep.source = TRUE or FALSE: it might be needed +exceptionally for a package designed to always be used with +keep.source = TRUE. +

+

The ‘ByteCompile’ logical field controls if the package code is to +be byte-compiled on installation: the default is currently not to, so +this may be useful for a package known to benefit particularly from +byte-compilation (which can take quite a long time and increases the +installed size of the package). It is used for the recommended +packages, as they are byte-compiled when R is installed and for +consistency should be byte-compiled when updated. This can be overridden +by installing with flag --no-byte-compile. +

+

The ‘ZipData’ logical field was used to control whether the automatic +Windows build would zip up the data directory or not prior to R +2.13.0: it is now ignored. +

+

The ‘Biarch’ logical field is used on Windows to select the +INSTALL option --force-biarch for this package. +(Introduced in R 3.0.0.) +

+

The ‘BuildVignettes’ logical field can be set to a false value to +stop R CMD build from attempting to build the vignettes, as +well as preventing7 R CMD check from testing +this. This should only be used exceptionally, for example if the PDFs +include large figures which are not part of the package sources (and +hence only in packages which do not have an Open Source license). +

+

The ‘VignetteBuilder’ field names (in a comma-separated list) +packages that provide an engine for building vignettes. These may +include the current package, or ones listed in ‘Depends’, +‘Suggests’ or ‘Imports’. The utils package is always +implicitly appended. See Non-Sweave vignettes for +details. +

+

If the DESCRIPTION file is not entirely in ASCII it +should contain an ‘Encoding’ field specifying an encoding. This is +used as the encoding of the DESCRIPTION file itself and of the +R and NAMESPACE files, and as the default encoding of +.Rd files. The examples are assumed to be in this encoding when +running R CMD check, and it is used for the encoding of the +CITATION file. Only encoding names latin1, latin2 +and UTF-8 are known to be portable. (Do not specify an encoding +unless one is actually needed: doing so makes the package less +portable. If a package has a specified encoding, you should run +R CMD build etc in a locale using that encoding.) +

+

The ‘NeedsCompilation’ field should be set to "yes" if the package contains code which needs to be compiled, otherwise "no" (when the package could be installed from source on any platform without additional tools). This is used by install.packages(type = "both") in R >= 2.15.2 on platforms where binary packages are the norm: it is normally set by R CMD build or the repository assuming compilation is required if and only if the package has a src directory.

+

The ‘OS_type’ field specifies the OS(es) for which the +package is intended. If present, it should be one of unix or +windows, and indicates that the package can only be installed +on a platform with ‘.Platform$OS.type’ having that value. +

+

The ‘Type’ field specifies the type of the package: +see Package types. +

+

One can add subject classifications for the content of the package using +the fields ‘Classification/ACM’ or ‘Classification/ACM-2012’ +(using the Computing Classification System of the Association for +Computing Machinery, http://www.acm.org/class/; the former refers +to the 1998 version), ‘Classification/JEL’ (the Journal of Economic +Literature Classification System, +https://www.aeaweb.org/econlit/jelCodes.php, or +‘Classification/MSC’ or ‘Classification/MSC-2010’ (the +Mathematics Subject Classification of the American Mathematical Society, +http://www.ams.org/msc/; the former refers to the 2000 version). +The subject classifications should be comma-separated lists of the +respective classification codes, e.g., ‘Classification/ACM: G.4, +H.2.8, I.5.1’. +

+

A ‘Language’ field can be used to indicate if the package +documentation is not in English: this should be a comma-separated list +of standard (not private use or grandfathered) IETF language tags as +currently defined by RFC 5646 +(https://tools.ietf.org/html/rfc5646, see also +https://en.wikipedia.org/wiki/IETF_language_tag), i.e., use +language subtags which in essence are 2-letter ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) or 3-letter ISO +639-3 (https://en.wikipedia.org/wiki/ISO_639-3) language +codes. +

+

As of R 3.2.0, an ‘RdMacros’ field can be used to hold a +comma-separated list of packages from which the current package will import +Rd macro definitions. These will be imported after the system macros, +in the order listed in the ‘RdMacros’ field, before any macro +definitions in the current package are loaded. Macro definitions in +individual .Rd files in the man directory are loaded +last, and are local to later parts of that file. In case of any +duplicates, the last loaded definition will be used8 +

+
+

Note: There should be no ‘Built’ or ‘Packaged’ fields, as these are +added by the package management tools. +

+ +

There is no restriction on the use of other fields not mentioned here +(but using other capitalizations of these field names would cause +confusion). Fields Note, Contact (for contacting the +authors/developers) and MailingList are in common use. Some +repositories (including CRAN and R-forge) add their own +fields. +

+ + +
+ + + +

1.1.2 Licensing

+ +

Licensing for a package which might be distributed is an important but +potentially complex subject. +

+

It is very important that you include license information! Otherwise, +it may not even be legally correct for others to distribute copies of +the package, let alone use it. +

+

The package management tools use the concept of +‘free or open source software’ +(FOSS, e.g., https://en.wikipedia.org/wiki/FOSS) +licenses: the idea being that some users of R and its packages want +to restrict themselves to such software. Others need to ensure that +there are no restrictions stopping them using a package, e.g. +forbidding commercial or military use. It is a central tenet of FOSS +software that there are no restrictions on users nor usage. +

+

Do not use the ‘License’ field for information on copyright +holders: if needed, use a ‘Copyright’ field. +

+

The mandatory ‘License’ field in the DESCRIPTION file should +specify the license of the package in a standardized form. Alternatives +are indicated via vertical bars. Individual specifications must +be one of +

    +
  • One of the “standard” short specifications +
    +
    GPL-2 GPL-3 LGPL-2 LGPL-2.1 LGPL-3 AGPL-3 Artistic-2.0
    +BSD_2_clause BSD_3_clause MIT
    +
    +

    as made available via https://www.R-project.org/Licenses/ and +contained in subdirectory share/licenses of the R source or home +directory. +

  • The names or abbreviations of other licenses contained in the license data base in file share/licenses/license.db in the R source or home directory, possibly (for versioned licenses) followed by a version restriction of the form ‘(op v)’ with ‘op’ one of the comparison operators ‘<’, ‘<=’, ‘>’, ‘>=’, ‘==’, or ‘!=’ and ‘v’ a numeric version specification (strings of non-negative integers separated by ‘.’), possibly combined via ‘,’ (see below for an example). For versioned licenses, one can also specify the name followed by the version, or combine an existing abbreviation and the version with a ‘-’.

    Abbreviations GPL and LGPL are ambiguous and usually taken +to mean any version of the license: but it is better not to use them. +

  • One of the strings ‘file LICENSE’ or ‘file LICENCE’ referring +to a file named LICENSE or LICENCE in the package (source +and installation) top-level directory. +
  • The string ‘Unlimited’, meaning that there are no restrictions on +distribution or use other than those imposed by relevant laws (including +copyright laws). +
+ +

If a package license restricts a base license (where permitted, +e.g., using GPL-3 or AGPL-3 with an attribution clause), the additional +terms should be placed in file LICENSE (or LICENCE), and +the string ‘+ file LICENSE’ (or ‘+ file LICENCE’, +respectively) should be appended to the corresponding individual license +specification. Note that several commonly used licenses do not permit +restrictions: this includes GPL-2 and hence any specification which +includes it. +

+

Examples of standardized specifications include +

+
License: GPL-2
+License: LGPL (>= 2.0, < 3) | Mozilla Public License
+License: GPL-2 | file LICENCE
+License: GPL (>= 2) | BSD_3_clause + file LICENSE
+License: Artistic-2.0 | AGPL-3 + file LICENSE
+
+

Please note in particular that “Public domain” is not a valid license, +since it is not recognized in some jurisdictions. +

+

Please ensure that the license you choose also covers any dependencies +(including system dependencies) of your package: it is particularly +important that any restrictions on the use of such dependencies are +evident to people reading your DESCRIPTION file. +

+

Fields ‘License_is_FOSS’ and ‘License_restricts_use’ may be +added by repositories where information cannot be computed from the name +of the license. ‘License_is_FOSS: yes’ is used for licenses which +are known to be FOSS, and ‘License_restricts_use’ can have values +‘yes’ or ‘no’ if the LICENSE file is known to restrict +users or usage, or known not to. These are used by, e.g., the +available.packages filters. +

+ + + +

The optional file LICENSE/LICENCE contains a copy of the +license of the package. To avoid any confusion only include such a file +if it is referred to in the ‘License’ field of the +DESCRIPTION file. +

+

Whereas you should feel free to include a license file in your +source distribution, please do not arrange to install yet +another copy of the GNU COPYING or COPYING.LIB +files but refer to the copies on +https://www.R-project.org/Licenses/ and included in the R +distribution (in directory share/licenses). Since files named +LICENSE or LICENCE will be installed, do not use +these names for standard license files. To include comments about the +licensing rather than the body of a license, use a file named something +like LICENSE.note. +

+

A few “standard” licenses are rather license templates which need additional information to be completed via ‘+ file LICENSE’.

+
+ +
+


+
+ +

1.1.3 Package Dependencies

+ +

The ‘Depends’ field gives a comma-separated list of package names +which this package depends on. Those packages will be attached before +the current package when library or require is called. +Each package name may be optionally followed by a comment in parentheses +specifying a version requirement. The comment should contain a +comparison operator, whitespace and a valid version number, +e.g. ‘MASS (>= 3.1-20)’. +

+

The ‘Depends’ field can also specify a dependence on a certain +version of R — e.g., if the package works only with R +version 3.0.0 or later, include ‘R (>= 3.0.0)’ in the +‘Depends’ field. You can also require a certain SVN revision for +R-devel or R-patched, e.g. ‘R (>= 2.14.0), R (>= r56550)’ +requires a version later than R-devel of late July 2011 (including +released versions of 2.14.0). +

+

It makes no sense to declare a dependence on R without a version +specification, nor on the package base: this is an R package +and package base is always available. +

+

A package or ‘R’ can appear more than once in the ‘Depends’ +field, for example to give upper and lower bounds on acceptable versions. +

+

Both library and the R package checking facilities use this +field: hence it is an error to use improper syntax or misuse the +‘Depends’ field for comments on other software that might be +needed. The R INSTALL facilities check if the version of +R used is recent enough for the package being installed, and the list +of packages which is specified will be attached (after checking version +requirements) before the current package. +

+

The ‘Imports’ field lists packages whose namespaces are imported +from (as specified in the NAMESPACE file) but which do not need +to be attached. Namespaces accessed by the ‘::’ and ‘:::’ +operators must be listed here, or in ‘Suggests’ or ‘Enhances’ +(see below). Ideally this field will include all the standard packages +that are used, and it is important to include S4-using packages (as +their class definitions can change and the DESCRIPTION file is +used to decide which packages to re-install when this happens). +Packages declared in the ‘Depends’ field should not also be in the +‘Imports’ field. Version requirements can be specified and are +checked when the namespace is loaded (since R >= 3.0.0). +

+

The ‘Suggests’ field uses the same syntax as ‘Depends’ and lists packages that are not necessarily needed. This includes packages used only in examples, tests or vignettes (see Writing package vignettes), and packages loaded in the body of functions. E.g., suppose an example9 from package foo uses a dataset from package bar. Then it is not necessary to have bar installed in order to use foo unless one wants to execute all the examples/tests/vignettes: it is useful to have bar, but not necessary. Version requirements can be specified, and will be used by R CMD check.

+

Finally, the ‘Enhances’ field lists packages “enhanced” by the +package at hand, e.g., by providing methods for classes from these +packages, or ways to handle objects from these packages (so several +packages have ‘Enhances: chron’ because they can handle datetime +objects from chron even though they prefer R’s native +datetime functions). Version requirements can be specified, but are +currently not used. Such packages cannot be required to check the +package: any tests which use them must be conditional on the presence +of the package. (If your tests use e.g. a dataset from another +package it should be in ‘Suggests’ and not ‘Enhances’.) +

+

The general rules are +

+
    +
  • A package should be listed in only one of these fields. +
  • Packages whose namespace only is needed to load the package using +library(pkgname) should be listed in the ‘Imports’ field +and not in the ‘Depends’ field. Packages listed in imports +or importFrom directives in the NAMESPACE file should +almost always be in ‘Imports’ and not ‘Depends’. +
  • Packages that need to be attached to successfully load the package using +library(pkgname) must be listed in the ‘Depends’ +field. +
  • All packages that are needed10 to successfully run R CMD check on the +package must be listed in one of ‘Depends’ or ‘Suggests’ or +‘Imports’. Packages used to run examples or tests conditionally +(e.g. via if(require(pkgname))) should be listed +in ‘Suggests’ or ‘Enhances’. (This allows checkers to ensure +that all the packages needed for a complete check are installed.) +
+ +

In particular, packages providing “only” data for examples or +vignettes should be listed in ‘Suggests’ rather than ‘Depends’ +in order to make lean installations possible. +

+

Version dependencies in the ‘Depends’ and ‘Imports’ fields are +used by library when it loads the package, and +install.packages checks versions for the ‘Depends’, +‘Imports’ and (for dependencies = TRUE) ‘Suggests’ +fields. +

+

It is increasingly important that the information in these fields is +complete and accurate: it is for example used to compute which packages +depend on an updated package and which packages can safely be installed +in parallel. +

+

This scheme was developed before all packages had namespaces (R +2.14.0 in October 2011), and good practice changed once that was in +place. +

+

Field ‘Depends’ should nowadays be used rarely, only for packages +which are intended to be put on the search path to make their facilities +available to the end user (and not to the package itself): for example +it makes sense that a user of package latticeExtra would want +the functions of package lattice made available. +

+

Almost always packages mentioned in ‘Depends’ should also be +imported from in the NAMESPACE file: this ensures that any needed +parts of those packages are available when some other package imports +the current package. +

+

The ‘Imports’ field should not contain packages which are not +imported from (via the NAMESPACE file or :: or +::: operators), as all the packages listed in that field need to +be installed for the current package to be installed. (This is checked +by R CMD check.) +

+

R code in the package should call library or require +only exceptionally. Such calls are never needed for packages listed in +‘Depends’ as they will already be on the search path. It used to +be common practice to use require calls for packages listed in +‘Suggests’ in functions which used their functionality, but +nowadays it is better to access such functionality via :: +calls. +

+

A package that wishes to make use of header files in other packages needs +to declare them as a comma-separated list in the field ‘LinkingTo’ +in the DESCRIPTION file. For example +

+
+
LinkingTo: link1, link2
+
+ +

As from R 3.0.2 the ‘LinkingTo’ field can have a version +requirement which is checked at installation. (In earlier versions of +R it would cause the specification to be ignored.) +

+

Specifying a package in ‘LinkingTo’ suffices if these are C++ +headers containing source code or static linking is done at +installation: the packages do not need to be (and usually should not be) +listed in the ‘Depends’ or ‘Imports’ fields. This includes +CRAN package BH and almost all users of +RcppArmadillo and RcppEigen. +

+

For another use of ‘LinkingTo’ see Linking to native routines in other packages. +

+

The ‘Additional_repositories’ field is a comma-separated list of +repository URLs where the packages named in the other fields may be +found. It is currently used by R CMD check to check that the +packages can be found, at least as source packages (which can be +installed on any platform). +

+ + + + +
+ + + +

1.1.3.1 Suggested packages

+ +

Note that someone wanting to run the examples/tests/vignettes may not +have a suggested package available (and it may not even be possible to +install it for that platform). The recommendation used to be to make +their use conditional via if(require("pkgname"))): +this is fine if that conditioning is done in examples/tests/vignettes. +

+

However, using require for conditioning in package code is +not good practice as it alters the search path for the rest of the +session and relies on functions in that package not being masked by +other require or library calls. It is better practice to +use code like +

+
   if (requireNamespace("rgl", quietly = TRUE)) {
+      rgl::plot3d(...)
+   } else {
+      ## do something else not involving rgl.
+   }
+
+

Note the use of rgl:: as that object would not necessarily be +visible (and if it is, it need not be the one from that namespace: +plot3d occurs in several other packages). If the intention is to +give an error if the suggested package is not available, simply use +e.g. rgl::plot3d. +

+

As noted above, packages in ‘Enhancesmust be used +conditionally and hence objects within them should always be accessed +via ::. +

+
+ + + +

1.1.4 The INDEX file

+ + +

The optional file INDEX contains a line for each sufficiently +interesting object in the package, giving its name and a description +(functions such as print methods not usually called explicitly might not +be included). Normally this file is missing and the corresponding +information is automatically generated from the documentation sources +(using tools::Rdindex()) when installing from source. +
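As a minimal sketch (the path mypkg is hypothetical), the index information can be previewed from the Rd sources with:
</p>
<pre>
## write an INDEX-style listing generated from the package's Rd files
tools::Rdindex(Sys.glob("mypkg/man/*.Rd"), outFile = "mypkg/INDEX")
</pre>
<p>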

+

The file is part of the information given by library(help = +pkgname). +

+

Rather than editing this file, it is preferable to put customized +information about the package into an overview help page +(see Documenting packages) and/or a vignette (see Writing package vignettes). +

+
+ + + +

1.1.5 Package subdirectories

+ + +

The R subdirectory contains R code files, only. The code +files to be installed must start with an ASCII (lower or upper +case) letter or digit and have one of the extensions11 .R, +.S, .q, .r, or .s. We recommend using +.R, as this extension seems to be not used by any other software. +It should be possible to read in the files using source(), so +R objects must be created by assignments. Note that there need be no +connection between the name of the file and the R objects created by +it. Ideally, the R code files should only directly assign R +objects and definitely should not call functions with side effects such +as require and options. If computations are required to +create objects these can use code ‘earlier’ in the package (see the +‘Collate’ field) plus functions in the ‘Depends’ packages +provided that the objects created do not depend on those packages except +via namespace imports. +

+

Two exceptions are allowed: if the R subdirectory contains a file +sysdata.rda (a saved image of one or more R objects: please +use suitable compression as suggested by tools::resaveRdaFiles, +and see also the ‘SysDataCompressionDESCRIPTION field.) +this will be lazy-loaded into the namespace environment – this is +intended for system datasets that are not intended to be user-accessible +via data. Also, files ending in ‘.in’ will be +allowed in the R directory to allow a configure script to +generate suitable files. +
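A minimal sketch of creating such a file, assuming two hypothetical internal objects:
</p>
<pre>
.lookup   <- data.frame(key = letters[1:3], value = 1:3)  # internal table
.defaults <- list(tol = 1e-8, maxit = 100L)               # internal settings
save(.lookup, .defaults, file = "R/sysdata.rda", compress = "xz")
## optionally let R choose the best compression afterwards
tools::resaveRdaFiles("R/sysdata.rda")
</pre>
<p>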

+

Only ASCII characters (and the control characters tab, +formfeed, LF and CR) should be used in code files. Other characters are +accepted in comments12, but then the comments may not +be readable in e.g. a UTF-8 locale. Non-ASCII characters in +object names will normally13 fail when the package is installed. Any byte will +be allowed in a quoted character string but \uxxxx escapes should +be used for non-ASCII characters. However, +non-ASCII character strings may not be usable in some locales +and may display incorrectly in others. +
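For example, a portable way to include non-ASCII strings in package code is:
</p>
<pre>
## only ASCII characters appear in the source file itself
city     <- "K\u00f6ln"      # "Köln"
greeting <- "\u00a1Hola!"    # "¡Hola!"
</pre>
<p>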

+ + +

Various R functions in a package can be used to initialize and +clean up. See Load hooks. +

+

The man subdirectory should contain (only) documentation files +for the objects in the package in R documentation (Rd) format. +The documentation filenames must start with an ASCII (lower or +upper case) letter or digit and have the extension .Rd (the +default) or .rd. Further, the names must be valid in +‘file://’ URLs, which means14 +they must be entirely ASCII and not contain ‘%’. +See Writing R documentation files, for more information. Note that +all user-level objects in a package should be documented; if a package +pkg contains user-level objects which are for “internal” use +only, it should provide a file pkg-internal.Rd which +documents all such objects, and clearly states that these are not meant +to be called by the user. See e.g. the sources for package grid +in the R distribution. Note that packages which use internal objects +extensively should not export those objects from their namespace, when +they do not need to be documented (see Package namespaces). +

+

Having a man directory containing no documentation files may give +an installation error. +

+

The man subdirectory may contain a subdirectory named macros; +this will contain source for user-defined Rd macros. +(See User-defined macros.) These use the Rd format, but may +not contain anything but macro definitions, comments and whitespace. +

+

The R and man subdirectories may contain OS-specific +subdirectories named unix or windows. +

+

The sources and headers for the compiled code are in src, plus +optionally a file Makevars or Makefile. When a package is +installed using R CMD INSTALL, make is used to control +compilation and linking into a shared object for loading into R. +There are default make variables and rules for this +(determined when R is configured and recorded in +R_HOME/etcR_ARCH/Makeconf), providing support for C, +C++, FORTRAN 77, Fortran 9x15, Objective C and Objective +C++16 with associated extensions .c, .cc or +.cpp, .f, .f90 or .f95, .m, and +.mm, respectively. We recommend using .h for headers, +also for C++17 or Fortran 9x include files. (Use of extension .C for +C++ is no longer supported.) Files in the src directory should +not be hidden (start with a dot), and hidden files will under some +versions of R be ignored. +

+

It is not portable (and may not be possible at all) to mix all these +languages in a single package, and we do not support using both C++ and +Fortran 9x. Because R itself uses it, we know that C and FORTRAN 77 +can be used together and mixing C and C++ seems to be widely successful. +

+

If your code needs to depend on the platform there are certain defines +which can used in C or C++. On all Windows builds (even 64-bit ones) +‘_WIN32’ will be defined: on 64-bit Windows builds also +‘_WIN64’, and on OS X ‘__APPLE__’ is defined.18 +

+

The default rules can be tweaked by setting macros19 in a file +src/Makevars (see Using Makevars). Note that this mechanism +should be general enough to eliminate the need for a package-specific +src/Makefile. If such a file is to be distributed, considerable +care is needed to make it general enough to work on all R platforms. +If it has any targets at all, it should have an appropriate first target +named ‘all’ and a (possibly empty) target ‘clean’ which +removes all files generated by running make (to be used by +‘R CMD INSTALL --clean’ and ‘R CMD INSTALL --preclean’). +There are platform-specific file names on Windows: +src/Makevars.win takes precedence over src/Makevars and +src/Makefile.win must be used. Some make programs +require makefiles to have a complete final line, including a newline. +

+

A few packages use the src directory for purposes other than +making a shared object (e.g. to create executables). Such packages +should have files src/Makefile and src/Makefile.win +(unless intended for only Unix-alikes or only Windows). +

+

In very special cases packages may create binary files other than the +shared objects/DLLs in the src directory. Such files will not be +installed in a multi-architecture setting since R CMD INSTALL +--libs-only is used to merge multiple sub-architectures and it only +copies shared objects/DLLs. If a package wants to install other +binaries (for example executable programs), it should provide an R +script src/install.libs.R which will be run as part of the +installation in the src build directory instead of copying +the shared objects/DLLs. The script is run in a separate R +environment containing the following variables: R_PACKAGE_NAME +(the name of the package), R_PACKAGE_SOURCE (the path to the +source directory of the package), R_PACKAGE_DIR (the path of the +target installation directory of the package), R_ARCH (the +arch-dependent part of the path, often empty), SHLIB_EXT (the +extension of shared objects) and WINDOWS (TRUE on Windows, +FALSE elsewhere). Something close to the default behavior could +be replicated with the following src/install.libs.R file: +

+
+
files <- Sys.glob(paste0("*", SHLIB_EXT))
+dest <- file.path(R_PACKAGE_DIR, paste0('libs', R_ARCH))
+dir.create(dest, recursive = TRUE, showWarnings = FALSE)
+file.copy(files, dest, overwrite = TRUE)
+if(file.exists("symbols.rds"))
+    file.copy("symbols.rds", dest, overwrite = TRUE)
+
+

On the other hand, executable programs could be installed along the +lines of +

+
execs <- c("one", "two", "three")
+if(WINDOWS) execs <- paste0(execs, ".exe")
+if ( any(file.exists(execs)) ) {
+  dest <- file.path(R_PACKAGE_DIR,  paste0('bin', R_ARCH))
+  dir.create(dest, recursive = TRUE, showWarnings = FALSE)
+  file.copy(execs, dest, overwrite = TRUE)
+}
+
+ +

Note the use of architecture-specific subdirectories of bin where +needed. +

+

The data subdirectory is for data files: See Data in packages. +

+

The demo subdirectory is for R scripts (for running via +demo()) that demonstrate some of the functionality of the +package. Demos may be interactive and are not checked automatically, so +if testing is desired use code in the tests directory to achieve +this. The script files must start with a (lower or upper case) letter +and have one of the extensions .R or .r. If present, the +demo subdirectory should also have a 00Index file with one +line for each demo, giving its name and a description separated by a +tab or at least three spaces. (This index file is not generated +automatically.) Note that a demo does not have a specified encoding and +so should be an ASCII file (see Encoding issues). As from +R 3.0.0 demo() will use the package encoding if there is one, +but this is mainly useful for non-ASCII comments. +
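For example, with a hypothetical package mypkg providing a demo ‘intro’ listed in demo/00Index:
</p>
<pre>
demo(package = "mypkg")            # list the demos the package provides
demo("intro", package = "mypkg")   # run demo/intro.R
</pre>
<p>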

+ +

The contents of the inst subdirectory will be copied recursively +to the installation directory. Subdirectories of inst should not +interfere with those used by R (currently, R, data, +demo, exec, libs, man, help, +html and Meta, and earlier versions used latex, +R-ex). The copying of the inst happens after src +is built so its Makefile can create files to be installed. To +exclude files from being installed, one can specify a list of exclude +patterns in file .Rinstignore in the top-level source directory. +These patterns should be Perl-like regular expressions (see the help for +regexp in R for the precise details), one per line, to be +matched case-insensitively20 +against the file and directory paths, e.g. doc/.*[.]png$ will +exclude all PNG files in inst/doc based on the extension. +
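The effect of an exclusion pattern can be checked from R; a minimal sketch (the file names are hypothetical):
</p>
<pre>
files <- c("inst/doc/fig1.PNG", "inst/doc/paper.Rnw")
## matched case-insensitively, as for .Rinstignore entries
grepl("doc/.*[.]png$", files, perl = TRUE, ignore.case = TRUE)   # TRUE FALSE
</pre>
<p>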

+

Note that with the exceptions of INDEX, +LICENSE/LICENCE and NEWS, information files at the +top level of the package will not be installed and so not be +known to users of Windows and OS X compiled packages (and not seen +by those who use R CMD INSTALL or install.packages +on the tarball). So any information files you wish an end user to see +should be included in inst. Note that if the named exceptions +also occur in inst, the version in inst will be that seen +in the installed package. +

+ + + + +

Things you might like to add to inst are a CITATION file +for use by the citation function, and a NEWS.Rd file for +use by the news function. See its help page for the specific +format restrictions of the NEWS.Rd file. +

+ + +

Another file sometimes needed in inst is AUTHORS or +COPYRIGHTS to specify the authors or copyright holders when this +is too complex to put in the DESCRIPTION file. +

+

Subdirectory tests is for additional package-specific test code, +similar to the specific tests that come with the R distribution. +Test code can either be provided directly in a .R file, or +via a .Rin file containing code which in turn creates the +corresponding .R file (e.g., by collecting all function objects +in the package and then calling them with the strangest arguments). The +results of running a .R file are written to a .Rout file. +If there is a corresponding21 .Rout.save file, these two are +compared, with differences being reported but not causing an error. The +directory tests is copied to the check area, and the tests are +run with the copy as the working directory and with R_LIBS set to +ensure that the copy of the package installed during testing will be +found by library(pkg_name). Note that the package-specific +tests are run in a vanilla R session without setting the +random-number seed, so tests which use random numbers will need to set +the seed to obtain reproducible results (and it can be helpful to do so +in all cases, to avoid occasional failures when tests are run). +
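A minimal sketch of a test file, say tests/rng-test.R (the file name and tolerance are arbitrary):
</p>
<pre>
## package-specific tests run without a fixed seed, so set one for reproducibility
set.seed(42)
x <- rnorm(1000)
stopifnot(abs(mean(x)) < 0.1)   # an error here causes the check to fail
</pre>
<p>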

+

If directory tests has a subdirectory Examples containing +a file pkg-Ex.Rout.save, this is compared to the output +file for running the examples when the latter are checked. Reference +output should be produced without having the --timings option +set (and note that --as-cran sets it). +

+

Subdirectory exec could contain additional executable scripts the +package needs, typically scripts for interpreters such as the shell, +Perl, or Tcl. NB: only files (and not directories) under exec are +installed (and those with names starting with a dot are ignored), and +they are all marked as executable (mode 755, moderated by +‘umask’) on POSIX platforms. Note too that this is not suitable +for executable programs since some platforms (including Windows) +support multiple architectures using the same installed package +directory. +

+

Subdirectory po is used for files related to localization: +see Internationalization. +

+

Subdirectory tools is the preferred place for auxiliary files needed during configuration, and also for sources needed to re-create scripts (e.g. M4 files for autoconf).

+ +
+ + + +

1.1.6 Data in packages

+ +

The data subdirectory is for data files, either to be made +available via lazy-loading or for loading using data(). +(The choice is made by the ‘LazyData’ field in the +DESCRIPTION file: the default is not to do so.) It should not be +used for other data files needed by the package, and the convention has +grown up to use directory inst/extdata for such files. +
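For example, for a hypothetical dataset ‘mydata’ shipped as data/mydata.rda in package mypkg:
</p>
<pre>
## with 'LazyData: yes' the object is available after library(mypkg);
## otherwise it must be loaded explicitly:
data("mydata", package = "mypkg")
str(mydata)
</pre>
<p>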

+

Data files can have one of three types as indicated by their extension: +plain R code (.R or .r), tables (.tab, +.txt, or .csv, see ?data for the file formats, and +note that .csv is not the standard22 CSV format), or +save() images (.RData or .rda). The files should +not be hidden (have names starting with a dot). Note that R code +should be “self-sufficient” and not make use of extra functionality +provided by the package, so that the data file can also be used without +having to load the package or its namespace. +

+

Images (extensions .RData23 or .rda) can contain +references to the namespaces of packages that were used to create them. +Preferably there should be no such references in data files, and in any +case they should only be to packages listed in the Depends and +Imports fields, as otherwise it may be impossible to install the +package. To check for such references, load all the images into a +vanilla R session, and look at the output of +loadedNamespaces(). +
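A minimal way to perform that check, run in a vanilla session (R --vanilla) started in the package's top-level directory:
</p>
<pre>
for (f in Sys.glob("data/*.rda")) load(f)
loadedNamespaces()   # should not list packages beyond the default ones
</pre>
<p>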

+

If your data files are large and you are not using ‘LazyData’ you +can speed up installation by providing a file datalist in the +data subdirectory. This should have one line per topic that +data() will find, in the format ‘foo’ if data(foo) +provides ‘foo’, or ‘foo: bar bah’ if data(foo) provides +‘bar’ and ‘bah’. R CMD build will automatically add +a datalist file to data directories of over 1Mb, using the +function tools::add_datalist. +

+

Tables (.tab, .txt, or .csv files) can be +compressed by gzip, bzip2 or xz, +optionally with additional extension .gz, .bz2 or +.xz. +

+

If your package is to be distributed, do consider the resource +implications of large datasets for your users: they can make packages +very slow to download and use up unwelcome amounts of storage space, as +well as taking many seconds to load. It is normally best to distribute +large datasets as .rda images prepared by save(, compress = +TRUE) (the default). Using bzip2 or xz compression +will usually reduce the size of both the package tarball and the +installed package, in some cases by a factor of two or more. +

+

Package tools has a couple of functions to help with data images: +checkRdaFiles reports on the way the image was saved, and +resaveRdaFiles will re-save with a different type of compression, +including choosing the best type for that particular image. +
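For example (a sketch, run in the package's top-level directory):
</p>
<pre>
tools::checkRdaFiles("data")                       # report size, compression and format
tools::resaveRdaFiles("data", compress = "auto")   # re-save, choosing the best compression per file
</pre>
<p>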

+

Some packages using ‘LazyData’ will benefit from using a form of +compression other than gzip in the installed lazy-loading +database. This can be selected by the --data-compress option +to R CMD INSTALL or by using the ‘LazyDataCompression’ +field in the DESCRIPTION file. Useful values are bzip2, +xz and the default, gzip. The only way to discover which +is best is to try them all and look at the size of the +pkgname/data/Rdata.rdb file. +

+

Lazy-loading is not supported for very large datasets (those which when +serialized exceed 2GB, the limit for the format on 32-bit platforms and +all platforms prior to R 3.0.0). +

+

The analogue for sysdata.rda is field ‘SysDataCompression’: +the default (since R 2.12.2) is xz for files bigger than 1MB +otherwise gzip. +

+
+ + + +

1.1.7 Non-R scripts in packages

+ +

Code which needs to be compiled (C, C++, FORTRAN, Fortran 95 …) +is included in the src subdirectory and discussed elsewhere in +this document. +

+

Subdirectory exec could be used for scripts for interpreters such +as the shell, BUGS, JavaScript, Matlab, Perl, php (amap), +Python or Tcl (Simile), or even R. However, it seems more +common to use the inst directory, for example +WriteXLS/inst/Perl, NMF/inst/m-files, +RnavGraph/inst/tcl, RProtoBuf/inst/python and +emdbook/inst/BUGS and gridSVG/inst/js. +
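At run time such a script is typically located with system.file() and run via system2(); a minimal sketch with hypothetical names:
</p>
<pre>
script <- system.file("perl", "convert.pl", package = "mypkg")
system2("perl", args = c(script, "input.dat"))
</pre>
<p>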

+

Java code is a special case: except for very small programs, +.java files should be byte-compiled (to a .class file) and +distributed as part of a .jar file: the conventional location for +the .jar file(s) is inst/java. It is desirable (and +required under an Open Source license) to make the Java source files +available: this is best done in a top-level java directory in the +package—the source files should not be installed. +

+

If your package requires one of these interpreters or an extension then +this should be declared in the ‘SystemRequirements’ field of its +DESCRIPTION file. (Users of Java most often do so via +rJava, when depending on/importing that suffices.) +

+

Windows and Mac users should be aware that the Tcl extensions +‘BWidget’ and ‘Tktable’ which are currently included with the +R for Windows and in the OS X installers are extensions and do +need to be declared for users of other platforms (and that +‘Tktable’ is less widely available than it used to be, including +not in the main repositories for major Linux distributions). +

+

BWidget’ needs to be installed by the user on other OSes. This is +fairly easy to do: first find the Tcl/Tk search path: +

+
+
library(tcltk)
+strsplit(tclvalue('auto_path'), " ")[[1]]
+
+ +

then download the sources from +http://sourceforge.net/projects/tcllib/files/BWidget/ and +at the command line run something like +

+
+
tar xf bwidget-1.9.8.tar.gz
+sudo mv bwidget-1.9.8 /usr/local/lib
+
+ +

substituting a location on the Tcl/Tk search path for /usr/local/lib if +needed. +

+
+ + + +

1.1.8 Specifying URLs

+ +

URLs in many places in the package documentation will be converted to +clickable hyperlinks in at least some of their renderings. So care is +needed that their forms are correct and portable. +

+

The full URL should be given, including the scheme (often ‘http://’ +or ‘https://’) and a final ‘/’ for references to directories. +

+

Spaces in URLs are not portable and how they are handled does vary by +HTTP server and by client. There should be no space in the host part of +an ‘http://’ URL, and spaces in the remainder should be encoded, +with each space replaced by ‘%20’. +

+

Other characters may benefit from being encoded: see the help on +URLencode(). +
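For example:
</p>
<pre>
URLencode("https://example.org/my file.html")   # the space becomes %20
URLencode("a b&c", reserved = TRUE)             # also percent-encodes reserved characters
</pre>
<p>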

+

The canonical URL for a CRAN package is +

+
https://cran.r-project.org/package=pkgname
+
+ +

and not a version starting +‘http://cran.r-project.org/web/packages/pkgname’. +

+
+ + + +

1.2 Configure and cleanup

+ +

Note that most of this section is specific to Unix-alikes: see the +comments later on about the Windows port of R. +

+

If your package needs some system-dependent configuration before +installation you can include an executable (Bourne shell) script +configure in your package which (if present) is executed by +R CMD INSTALL before any other action is performed. This can be +a script created by the Autoconf mechanism, but may also be a script +written by yourself. Use this to detect if any nonstandard libraries +are present such that corresponding code in the package can be disabled +at install time rather than giving error messages when the package is +compiled or used. To summarize, the full power of Autoconf is available +for your extension package (including variable substitution, searching +for libraries, etc.). +

+

Under a Unix-alike only, an executable (Bourne shell) script +cleanup is executed as the last thing by R CMD INSTALL if +option --clean was given, and by R CMD build when +preparing the package for building from its source. +

+

As an example consider we want to use functionality provided by a (C or +FORTRAN) library foo. Using Autoconf, we can create a configure +script which checks for the library, sets variable HAVE_FOO to +TRUE if it was found and to FALSE otherwise, and then +substitutes this value into output files (by replacing instances of +‘@HAVE_FOO@’ in input files with the value of HAVE_FOO). +For example, if a function named bar is to be made available by +linking against library foo (i.e., using -lfoo), one +could use +

+
+
AC_CHECK_LIB(foo, fun, [HAVE_FOO=TRUE], [HAVE_FOO=FALSE])
+AC_SUBST(HAVE_FOO)
+......
+AC_CONFIG_FILES([foo.R])
+AC_OUTPUT
+
+ +

in configure.ac (assuming Autoconf 2.50 or later). +

+

The definition of the respective R function in foo.R.in could be +

+
+
foo <- function(x) {
+    if(!@HAVE_FOO@)
+      stop("Sorry, library 'foo' is not available")
+    ...
+
+ +

From this file configure creates the actual R source file +foo.R looking like +

+
+
foo <- function(x) {
+    if(!FALSE)
+      stop("Sorry, library 'foo' is not available")
+    ...
+
+ +

if library foo was not found (with the desired functionality). +In this case, the above R code effectively disables the function. +

+

One could also use different file fragments for available and missing +functionality, respectively. +

+

You will very likely need to ensure that the same C compiler and +compiler flags are used in the configure tests as when compiling +R or your package. Under a Unix-alike, you can achieve this by +including the following fragment early in configure.ac +(before calling AC_PROG_CC) +

+
+
: ${R_HOME=`R RHOME`}
+if test -z "${R_HOME}"; then
+  echo "could not determine R_HOME"
+  exit 1
+fi
+CC=`"${R_HOME}/bin/R" CMD config CC`
+CFLAGS=`"${R_HOME}/bin/R" CMD config CFLAGS`
+CPPFLAGS=`"${R_HOME}/bin/R" CMD config CPPFLAGS`
+
+ +

(Using ‘${R_HOME}/bin/R’ rather than just ‘R’ is necessary +in order to use the correct version of R when running the script as +part of R CMD INSTALL, and the quotes since ‘${R_HOME}’ +might contain spaces.) +

+

If your code does load checks then you may also need +

+
LDFLAGS=`"${R_HOME}/bin/R" CMD config LDFLAGS`
+
+ +

and packages written with C++ need to pick up the details for the C++ +compiler and switch the current language to C++ by something like +

+
CXX=`"${R_HOME}/bin/R" CMD config CXX`
+CXXFLAGS=`"${R_HOME}/bin/R" CMD config CXXFLAGS`
+AC_LANG(C++)
+
+ +

The latter is important, as for example C headers may not be available +to C++ programs or may not be written to avoid C++ name-mangling. +

+ +

You can use R CMD config for getting the value of the basic +configuration variables, and also the header and library flags necessary +for linking a front-end executable program against R, see R CMD +config --help for details. +

+

To check for an external BLAS library using the ACX_BLAS macro +from the official Autoconf Macro Archive, one can simply do +

+
+
F77=`"${R_HOME}/bin/R" CMD config F77`
+AC_PROG_F77
+FLIBS=`"${R_HOME}/bin/R" CMD config FLIBS`
+ACX_BLAS([], AC_MSG_ERROR([could not find your BLAS library], 1))
+
+ +

Note that FLIBS as determined by R must be used to ensure that +FORTRAN 77 code works on all R platforms. Calls to the Autoconf macro +AC_F77_LIBRARY_LDFLAGS, which would overwrite FLIBS, must +not be used (and hence e.g. removed from ACX_BLAS). (Recent +versions of Autoconf in fact allow an already set FLIBS to +override the test for the FORTRAN linker flags.) +

+ +

N.B.: If the configure script creates files, e.g. +src/Makevars, you do need a cleanup script to remove +them. Otherwise R CMD build may ship the files that are +created. For example, package RODBC has +

+
+
#!/bin/sh
+
+rm -f config.* src/Makevars src/config.h
+
+ +

As this example shows, configure often creates working files +such as config.log. +

+

If your configure script needs auxiliary files, it is recommended that +you ship them in a tools directory (as R itself does). +

+

You should bear in mind that the configure script will not be used on Windows systems. If your package is to be made publicly available, please give enough information for a user on a non-Unix-alike platform to configure it manually, or provide a configure.win script to be used on that platform. (Optionally, there can be a cleanup.win script. Both should be shell scripts to be executed by ash, which is a minimal version of Bourne-style sh.) When configure.win is run the environment variables R_HOME (which uses ‘/’ as the file separator), R_ARCH and R_ARCH_BIN will be set. Use R_ARCH to decide if this is a 64-bit build (its value there is ‘/x64’) and to install DLLs to the correct place (${R_HOME}/libs${R_ARCH}). Use R_ARCH_BIN to find the correct place under the bin directory, e.g. ${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe.

+

In some rare circumstances, the configuration and cleanup scripts need +to know the location into which the package is being installed. An +example of this is a package that uses C code and creates two shared +object/DLLs. Usually, the object that is dynamically loaded by R +is linked against the second, dependent, object. On some systems, we +can add the location of this dependent object to the object that is +dynamically loaded by R. This means that each user does not have to +set the value of the LD_LIBRARY_PATH (or equivalent) environment +variable, but that the secondary object is automatically resolved. +Another example is when a package installs support files that are +required at run time, and their location is substituted into an R +data structure at installation time. (This happens with the Java Archive +files in the Omegahat SJava package.) + + + +The names of the top-level library directory (i.e., specifiable +via the ‘-l’ argument) and the directory of the package +itself are made available to the installation scripts via the two +shell/environment variables R_LIBRARY_DIR and R_PACKAGE_DIR. +Additionally, the name of the package (e.g. ‘survival’ or +‘MASS’) being installed is available from the environment variable +R_PACKAGE_NAME. (Currently the value of R_PACKAGE_DIR is +always ${R_LIBRARY_DIR}/${R_PACKAGE_NAME}, but this used not to +be the case when versioned installs were allowed. Its main use is in +configure.win scripts for the installation path of external +software’s DLLs.) Note that the value of R_PACKAGE_DIR may +contain spaces and other shell-unfriendly characters, and so should be +quoted in makefiles and configure scripts. +

+

One of the more tricky tasks can be to find the headers and libraries of +external software. One tool which is increasingly available on +Unix-alikes (but not by default on OS X) to do this is +pkg-config. The configure script will need to test for +the presence of the command itself (see for example package +Cairo), and if present it can be asked if the software is +installed, of a suitable version and for compilation/linking flags by +e.g. +

+
+
$ pkg-config --exists 'QtCore >= 4.0.0'  # check the status
+$ pkg-config --modversion QtCore
+4.7.1
+$ pkg-config --cflags QtCore
+-DQT_SHARED -I/usr/include/QtCore
+$ pkg-config --libs QtCore
+-lQtCore
+
+ +

Note that pkg-config --libs gives the information +required to link against the default version of that library (usually +the dynamic one), and pkg-config --static is needed if the +static library is to be used. +

+

Sometimes the name by which the software is known to +pkg-config is not what one might expect (e.g. +‘gtk+-2.0’ even for 2.22). To get a complete list use +

+
+
pkg-config --list-all | sort
+
+ + + + + + + + +
+ + + +

1.2.1 Using Makevars

+ + + + + + + +

Sometimes writing your own configure script can be avoided by +supplying a file Makevars: also one of the most common uses of a +configure script is to make Makevars from +Makevars.in. +

+

A Makevars file is a makefile and is used as one of several +makefiles by R CMD SHLIB (which is called by R CMD +INSTALL to compile code in the src directory). It should be +written if at all possible in a portable style, in particular (except +for Makevars.win) without the use of GNU extensions. +

+

The most common use of a Makevars file is to set additional +preprocessor options (for example include paths) for C/C++ files +via PKG_CPPFLAGS, and additional compiler flags by setting +PKG_CFLAGS, PKG_CXXFLAGS, PKG_FFLAGS or +PKG_FCFLAGS, for C, C++, FORTRAN or Fortran 9x respectively +(see Creating shared objects). +

+

N.B.: Include paths are preprocessor options, not compiler +options, and must be set in PKG_CPPFLAGS as otherwise +platform-specific paths (e.g. ‘-I/usr/local/include’) will take +precedence. +

+

Makevars can also be used to set flags for the linker, for +example ‘-L’ and ‘-l’ options, via PKG_LIBS. +

+

When writing a Makevars file for a package you intend to +distribute, take care to ensure that it is not specific to your +compiler: flags such as -O2 -Wall -pedantic (and all other +-W flags: for the Solaris compiler these are used to pass +arguments to compiler phases) are all specific to GCC. +

+

Also, do not set variables such as CPPFLAGS, CFLAGS etc.: +these should be settable by users (sites) through appropriate personal +(site-wide) Makevars files. +See Customizing package compilation in R Installation and Administration, +

+

There are some macros24 which are set whilst configuring the +building of R itself and are stored in +R_HOME/etcR_ARCH/Makeconf. That makefile is included +as a Makefile after Makevars[.win], and the macros +it defines can be used in macro assignments and make command lines in +the latter. These include +

+
+
FLIBS
+
+

A macro containing the set of libraries need to link FORTRAN code. This +may need to be included in PKG_LIBS: it will normally be included +automatically if the package contains FORTRAN source files. +

+
+
BLAS_LIBS
+
+

A macro containing the BLAS libraries used when building R. This may +need to be included in PKG_LIBS. Beware that if it is empty then +the R executable will contain all the double-precision and +double-complex BLAS routines, but no single-precision nor complex +routines. If BLAS_LIBS is included, then FLIBS also needs +to be25 included following it, as most BLAS +libraries are written at least partially in FORTRAN. +

+
+
LAPACK_LIBS
+
+

A macro containing the LAPACK libraries (and paths where appropriate) +used when building R. This may need to be included in +PKG_LIBS. It may point to a dynamic library libRlapack +which contains the main double-precision LAPACK routines as well as +those double-complex LAPACK routines needed to build R, or it may +point to an external LAPACK library, or may be empty if an external BLAS +library also contains LAPACK. +

+

[libRlapack includes all the double-precision LAPACK routines +current in 2003: a list of which routines are included is in file +src/modules/lapack/README.] +

+

For portability, the macros BLAS_LIBS and FLIBS should +always be included after LAPACK_LIBS (and in that order). +

+
+
SAFE_FFLAGS
+
+

A macro containing flags which are needed to circumvent +over-optimization of FORTRAN code: it is typically ‘-g -O2 +-ffloat-store’ on ‘ix86’ platforms using gfortran. +Note that this is not an additional flag to be used as part of +PKG_FFLAGS, but a replacement for FFLAGS, and that it is +intended for the FORTRAN 77 compiler ‘F77’ and not necessarily for +the Fortran 90/95 compiler ‘FC’. See the example later in this +section. +

+
+ + +

Setting certain macros in Makevars will prevent R CMD +SHLIB setting them: in particular if Makevars sets +‘OBJECTS’ it will not be set on the make command line. +This can be useful in conjunction with implicit rules to allow other +types of source code to be compiled and included in the shared object. +It can also be used to control the set of files which are compiled, +either by excluding some files in src or including some files in +subdirectories. For example +

+
+
OBJECTS = 4dfp/endianio.o 4dfp/Getifh.o R4dfp-object.o
+
+ + +

Note that Makevars should not normally contain targets, as it is +included before the default makefile and make will call the +first target, intended to be all in the default makefile. If you +really need to circumvent that, use a suitable (phony) target all +before any actual targets in Makevars.[win]: for example package +fastICA used to have +

+
+
PKG_LIBS = @BLAS_LIBS@
+
+SLAMC_FFLAGS=$(R_XTRA_FFLAGS) $(FPICFLAGS) $(SHLIB_FFLAGS) $(SAFE_FFLAGS)
+
+all: $(SHLIB)
+
+slamc.o: slamc.f
+	$(F77) $(SLAMC_FFLAGS) -c -o slamc.o slamc.f
+
+ +

needed to ensure that the LAPACK routines find some constants without +infinite looping. The Windows equivalent was +

+
+
all: $(SHLIB)
+
+slamc.o: slamc.f
+	$(F77) $(SAFE_FFLAGS) -c -o slamc.o slamc.f
+
+ +

(since the other macros are all empty on that platform, and R’s +internal BLAS was not used). Note that the first target in +Makevars will be called, but for back-compatibility it is best +named all. +

+

If you want to create and then link to a library, say using code in a +subdirectory, use something like +

+
+
.PHONY: all mylibs
+
+all: $(SHLIB)
+$(SHLIB): mylibs
+
+mylibs:
+	(cd subdir; make)
+
+ +

Be careful to create all the necessary dependencies, as there is no guarantee that the dependencies of all will be run in a particular order (and some of the CRAN build machines use multiple CPUs and parallel makes).

+

Note that on Windows it is required that Makevars[.win] does +create a DLL: this is needed as it is the only reliable way to ensure +that building a DLL succeeded. If you want to use the src +directory for some purpose other than building a DLL, use a +Makefile.win file. +

+

It is sometimes useful to have a target ‘clean’ in Makevars +or Makevars.win: this will be used by R CMD build to +clean up (a copy of) the package sources. When it is run by +build it will have fewer macros set, in particular not +$(SHLIB), nor $(OBJECTS) unless set in the file itself. +It would also be possible to add tasks to the target ‘shlib-clean’ +which is run by R CMD INSTALL and R CMD SHLIB with +options --clean and --preclean. +

+

If you want to run R code in Makevars, e.g. to find +configuration information, please do ensure that you use the correct +copy of R or Rscript: there might not be one in the path +at all, or it might be the wrong version or architecture. The correct +way to do this is via +

+
+
"$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" filename
+"$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e ‘R expression’
+
+ +

where $(R_ARCH_BIN) is only needed currently on Windows. +

+

Environment or make variables can be used to select different macros for +32- and 64-bit code, for example (GNU make syntax, allowed on +Windows) +

+
+
ifeq "$(WIN)" "64"
+PKG_LIBS = value for 64-bit Windows
+else
+PKG_LIBS = value for 32-bit Windows
+endif
+
+ +

On Windows there is normally a choice between linking to an import +library or directly to a DLL. Where possible, the latter is much more +reliable: import libraries are tied to a specific toolchain, and in +particular on 64-bit Windows two different conventions have been +commonly used. So for example instead of +

+
+
PKG_LIBS = -L$(XML_DIR)/lib -lxml2
+
+ +

one can use +

+
+
PKG_LIBS = -L$(XML_DIR)/bin -lxml2
+
+ +

since on Windows -lxxx will look in turn for +

+
+
libxxx.dll.a
+xxx.dll.a
+libxxx.a
+xxx.lib
+libxxx.dll
+xxx.dll
+
+ +

where the first and second are conventionally import libraries, the +third and fourth often static libraries (with .lib intended for +Visual C++), but might be import libraries. See for example +https://sourceware.org/binutils/docs-2.20/ld/WIN32.html#WIN32. +

+

The fly in the ointment is that the DLL might not be named +libxxx.dll, and in fact on 32-bit Windows there is a +libxml2.dll whereas on one build for 64-bit Windows the DLL is +called libxml2-2.dll. Using import libraries can cover over +these differences but can cause equal difficulties. +

+

If static libraries are available they can save a lot of problems with +run-time finding of DLLs, especially when binary packages are to be +distributed and even more when these support both architectures. Where +using DLLs is unavoidable we normally arrange (via +configure.win) to ship them in the same directory as the package +DLL. +

+
+ +
+


+
+ +

1.2.1.1 OpenMP support

+ + + +

There is some support for packages which wish to use +OpenMP26. The +make macros +

+
+
SHLIB_OPENMP_CFLAGS
+SHLIB_OPENMP_CXXFLAGS
+SHLIB_OPENMP_FCFLAGS
+SHLIB_OPENMP_FFLAGS
+
+ +

are available for use in src/Makevars or src/Makevars.win. +Include the appropriate macro in PKG_CFLAGS, PKG_CPPFLAGS +and so on, and also in PKG_LIBS. C/C++ code that needs to be +conditioned on the use of OpenMP can be used inside #ifdef +_OPENMP: note that some toolchains used for R (including most of +those using clang27) have no OpenMP support at all, not even +omp.h. +

+

For example, a package with C code written for OpenMP should have in +src/Makevars the lines +

+
+
PKG_CFLAGS = $(SHLIB_OPENMP_CFLAGS)
+PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
+
+ +

Note that the macro SHLIB_OPENMP_CXXFLAGS applies to the C++98 +compiler and not necessarily to the C++11 compiler: users of the latter +should do their own configure checks. +

+

Some care is needed when compilers are from different families which may +use different OpenMP runtimes (e.g. clang vs GCC +including gfortran, although it is currently possible to use +the clang runtime with GCC but not vice versa). For a +package with Fortran 77 code using OpenMP the appropriate lines are +

+
+
PKG_FFLAGS = $(SHLIB_OPENMP_FFLAGS)
+PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
+
+ +

as the C compiler will be used to link the package code (and there is no +guarantee that this will work everywhere). +

+ +

There is nothing to say what version of OpenMP is supported: version 3.0 +(May 2008) is supported by recent versions of the Linux, Windows and +Solaris platforms, but portable packages cannot assume that end users +have recent versions. OS X currently uses Apple builds of +clang with no OpenMP support. +

+

The performance of OpenMP varies substantially between platforms. Both +the Windows and earlier Apple OS X implementations have substantial +overheads and are only beneficial if quite substantial tasks are run in +parallel. Also, on Windows new threads are started with the +default28 FPU control +word, so computations done on OpenMP threads will not make use of +extended-precision arithmetic which is the default for the main process. +

+

Calling any of the R API from threaded code is ‘for experts only’: +they will need to read the source code to determine if it is +thread-safe. In particular, code which makes use of the stack-checking +mechanism must not be called from threaded code. +

+

Packages are not stand-alone programs, and an R process could contain more than one OpenMP-enabled package as well as other components (for example, an optimized BLAS) making use of OpenMP. So careful consideration needs to be given to resource usage. OpenMP works with parallel regions, and for most implementations the default is to use as many threads as ‘CPUs’ for such regions. Parallel regions can be nested, although it is common to use only a single thread below the first level. The correctness of the detected number of ‘CPUs’ and the assumption that the R process is entitled to use them all are both dubious assumptions. The best way to limit resources is to limit the overall number of threads available to OpenMP in the R process: this can be done via environment variable OMP_THREAD_LIMIT, where implemented.29 Alternatively, the number of threads per region can be limited by the environment variable OMP_NUM_THREADS or API call omp_set_num_threads, or, better, for the regions in your code as part of their specification. E.g. R uses

+
#pragma omp parallel for num_threads(nthreads) …
+
+

That way you only control your own code and not that of other OpenMP users. +
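As an R-level illustration only (a sketch: whether these variables are honoured depends on the OpenMP implementation, and they are best set before the OpenMP runtime is first initialized):
</p>
<pre>
Sys.setenv(OMP_THREAD_LIMIT = 2)   # cap the total number of OpenMP threads in this process
Sys.setenv(OMP_NUM_THREADS = 2)    # default number of threads per parallel region
</pre>
<p>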

+
+ + + +

1.2.1.2 Using pthreads

+ +

There is no direct support for the POSIX threads (more commonly known as +pthreads): by the time we considered adding it several packages +were using it unconditionally so it seems that nowadays it is +universally available on POSIX operating systems (hence not Windows). +

+

For reasonably recent versions of gcc and clang the +correct specification is +

+
+
PKG_CPPFLAGS = -pthread
+PKG_LIBS = -pthread
+
+ +

(and the plural version is also accepted on some systems/versions). For +other platforms the specification is +

+
+
PKG_CPPFLAGS = -D_REENTRANT
+PKG_LIBS = -lpthread
+
+

(and note that the library name is singular). This is what +-pthread does on all known current platforms (although earlier +versions of OpenBSD used a different library name). +

+

For a tutorial see +https://computing.llnl.gov/tutorials/pthreads/. +

+

POSIX threads are not normally used on Windows, which has its own native +concepts of threads. However, there are two projects implementing +pthreads on top of Windows, pthreads-w32 and +winpthreads (a recent part of the MinGW-w64 project). +

+

Whether Windows toolchains implement pthreads is up to the +toolchain provider: the currently recommended toolchain does by default +provide it. A make variable SHLIB_PTHREAD_FLAGS is +available: this should be included in both PKG_CPPFLAGS (or the +Fortran or F9x equivalents) and PKG_LIBS. +

+

The presence of a working pthreads implementation cannot be +unambiguously determined without testing for yourself: however, that +‘_REENTRANT’ is defined30 in C/C++ code is a good indication. +

+

See also the comments on thread-safety and performance under OpenMP: on +all known R platforms OpenMP is implemented via +pthreads and the known performance issues are in the latter. +

+
+ +
+


+
+ +

1.2.1.3 Compiling in sub-directories

+ +

Package authors fairly often want to organize code in sub-directories of +src, for example if they are including a separate piece of +external software to which this is an R interface. +

+

One simple way is simply to set OBJECTS to be all the objects +that need to be compiled, including in sub-directories. For example, +CRAN package RSiena has +

+
+
SOURCES = $(wildcard data/*.cpp network/*.cpp utils/*.cpp model/*.cpp model/*/*.cpp model/*/*/*.cpp)
+
+OBJECTS = siena07utilities.o siena07internals.o siena07setup.o siena07models.o $(SOURCES:.cpp=.o)
+
+ +

One problem with that approach is that unless GNU make extensions are +used, the source files need to be listed and kept up-to-date. As in the +following from CRAN package lossDev: +

+
+
OBJECTS.samplers = samplers/ExpandableArray.o samplers/Knots.o \
+  samplers/RJumpSpline.o samplers/RJumpSplineFactory.o \
+  samplers/RealSlicerOV.o samplers/SliceFactoryOV.o samplers/MNorm.o
+OBJECTS.distributions = distributions/DSpline.o \
+  distributions/DChisqrOV.o distributions/DTOV.o \
+  distributions/DNormOV.o distributions/DUnifOV.o distributions/RScalarDist.o
+OBJECTS.root = RJump.o
+
+OBJECTS = $(OBJECTS.samplers) $(OBJECTS.distributions) $(OBJECTS.root)
+
+ +

Where the subdirectory is self-contained code with a suitable makefile, +the best approach is something like +

+
+
PKG_LIBS = -LCsdp/lib -lsdp $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
+
+$(SHLIB): Csdp/lib/libsdp.a
+
+Csdp/lib/libsdp.a:
+	@(cd Csdp/lib && $(MAKE) libsdp.a \
+	  CC="$(CC)" CFLAGS="$(CFLAGS) $(CPICFLAGS)" AR="$(AR)" RANLIB="$(RANLIB)")
+
+ +

Note the quotes: the macros can contain spaces, e.g. CC = "gcc +-m64 -std=gnu99". Several authors have forgotten about parallel makes: +the static library in the subdirectory must be made before the shared +object ($(SHLIB)) and so the latter must depend on the former. +Others forget the need for position-independent code. +

+

We really do not recommend using src/Makefile instead of +src/Makevars, and as the example above shows, it is not +necessary. +

+
+ + + +

1.2.2 Configure example

+ +

It may be helpful to give an extended example of using a +configure script to create a src/Makevars file: this is +based on that in the RODBC package. +

+

The configure.ac file follows: configure is created from +this by running autoconf in the top-level package directory +(containing configure.ac). +

+
+
+
AC_INIT([RODBC], 1.1.8) dnl package name, version
+
+dnl A user-specifiable option
+odbc_mgr=""
+AC_ARG_WITH([odbc-manager],
+	    AC_HELP_STRING([--with-odbc-manager=MGR],
+			   [specify the ODBC manager, e.g. odbc or iodbc]),
+	    [odbc_mgr=$withval])
+
+if test "$odbc_mgr" = "odbc" ; then
+  AC_PATH_PROGS(ODBC_CONFIG, odbc_config)
+fi
+
+dnl Select an optional include path, from a configure option
+dnl or from an environment variable.
+AC_ARG_WITH([odbc-include],
+	    AC_HELP_STRING([--with-odbc-include=INCLUDE_PATH],
+			   [the location of ODBC header files]),
+	    [odbc_include_path=$withval])
+RODBC_CPPFLAGS="-I."
+if test [ -n "$odbc_include_path" ] ; then
+   RODBC_CPPFLAGS="-I. -I${odbc_include_path}"
+else
+  if test [ -n "${ODBC_INCLUDE}" ] ; then
+     RODBC_CPPFLAGS="-I. -I${ODBC_INCLUDE}"
+  fi
+fi
+
+dnl ditto for a library path
+AC_ARG_WITH([odbc-lib],
+	    AC_HELP_STRING([--with-odbc-lib=LIB_PATH],
+			   [the location of ODBC libraries]),
+	    [odbc_lib_path=$withval])
+if test [ -n "$odbc_lib_path" ] ; then
+   LIBS="-L$odbc_lib_path ${LIBS}"
+else
+  if test [ -n "${ODBC_LIBS}" ] ; then
+     LIBS="-L${ODBC_LIBS} ${LIBS}"
+  else
+    if test -n "${ODBC_CONFIG}"; then
+      odbc_lib_path=`odbc_config --libs | sed s/-lodbc//`
+      LIBS="${odbc_lib_path} ${LIBS}"
+    fi
+  fi
+fi
+
+dnl Now find the compiler and compiler flags to use
+: ${R_HOME=`R RHOME`}
+if test -z "${R_HOME}"; then
+  echo "could not determine R_HOME"
+  exit 1
+fi
+CC=`"${R_HOME}/bin/R" CMD config CC`
+CPP=`"${R_HOME}/bin/R" CMD config CPP`
+CFLAGS=`"${R_HOME}/bin/R" CMD config CFLAGS`
+CPPFLAGS=`"${R_HOME}/bin/R" CMD config CPPFLAGS`
+AC_PROG_CC
+AC_PROG_CPP
+
+
+if test -n "${ODBC_CONFIG}"; then
+  RODBC_CPPFLAGS=`odbc_config --cflags`
+fi
+CPPFLAGS="${CPPFLAGS} ${RODBC_CPPFLAGS}"
+
+dnl Check the headers can be found
+AC_CHECK_HEADERS(sql.h sqlext.h)
+if test "${ac_cv_header_sql_h}" = no ||
+   test "${ac_cv_header_sqlext_h}" = no; then
+   AC_MSG_ERROR("ODBC headers sql.h and sqlext.h not found")
+fi
+
+dnl search for a library containing an ODBC function
+if test [ -n "${odbc_mgr}" ] ; then
+  AC_SEARCH_LIBS(SQLTables, ${odbc_mgr}, ,
+      AC_MSG_ERROR("ODBC driver manager ${odbc_mgr} not found"))
+else
+  AC_SEARCH_LIBS(SQLTables, odbc odbc32 iodbc, ,
+      AC_MSG_ERROR("no ODBC driver manager found"))
+fi
+
+dnl for 64-bit ODBC need SQL[U]LEN, and it is unclear where they are defined.
+AC_CHECK_TYPES([SQLLEN, SQLULEN], , , [# include <sql.h>])
+dnl for unixODBC header
+AC_CHECK_SIZEOF(long, 4)
+
+dnl substitute RODBC_CPPFLAGS and LIBS
+AC_SUBST(RODBC_CPPFLAGS)
+AC_SUBST(LIBS)
+AC_CONFIG_HEADERS([src/config.h])
+dnl and do substitution in the src/Makevars.in and src/config.h
+AC_CONFIG_FILES([src/Makevars])
+AC_OUTPUT
+
+
+ +

where src/Makevars.in would be simply +

+
+
+
PKG_CPPFLAGS = @RODBC_CPPFLAGS@
+PKG_LIBS = @LIBS@
+
+
+ +

A user can then be advised to specify the location of the ODBC driver +manager files by options like (lines broken for easier reading) +

+
+
R CMD INSTALL \
+  --configure-args='--with-odbc-include=/opt/local/include \
+  --with-odbc-lib=/opt/local/lib --with-odbc-manager=iodbc' \
+  RODBC
+
+ +

or by setting the environment variables ODBC_INCLUDE and +ODBC_LIBS. +

+
+ + + +

1.2.3 Using F95 code

+ +

R assumes that source files with extension .f are FORTRAN 77, +and passes them to the compiler specified by ‘F77’. On most but +not all platforms that compiler will accept Fortran 90/95 code: some +platforms have a separate Fortran 90/95 compiler and a few (by now quite +rare31) platforms have no Fortran +90/95 support. +

+

This means that portable packages need to be written in correct +FORTRAN 77, which will also be valid Fortran 95. See +https://developer.R-project.org/Portability.html for reference +resources. In particular, free source form F95 code is not +portable. +

+

On some systems an alternative F95 compiler is available: from the +gcc family this might be gfortran or g95. +Configuring R will try to find a compiler which (from its name) +appears to be a Fortran 90/95 compiler, and set it in macro ‘FC’. +Note that it does not check that such a compiler is fully (or even +partially) compliant with Fortran 90/95. Packages making use of Fortran +90/95 features should use file extension .f90 or .f95 for +the source files: the variable PKG_FCFLAGS specifies any special +flags to be used. There is no guarantee that compiled Fortran 90/95 +code can be mixed with any other type of compiled code, nor that a build +of R will have support for such packages. +

+

Some (but not all) compilers specified by the ‘FC’ macro will accept Fortran 2003 or 2008 code: such code should still use file extension .f90 or .f95. For platforms using gfortran, you may need to include -std=f2003 or -std=f2008 in PKG_FCFLAGS: the default is ‘GNU Fortran’, Fortran 95 with non-standard extensions. The Solaris f95 compiler ‘accepts some Fortran 2003 features’.

+

Modern versions of Fortran support modules, whereby compiling one source +file creates a module file which is then included in others. (Module +files typically have a .mod extension: they do depend on the +compiler used and so should never be included in a package.) This +creates a dependence which make will not know about and often +causes installation with a parallel make to fail. Thus it is necessary +to add explicit dependencies to src/Makevars to tell +make the constraints on the order of compilation. For +example, if file iface.f90 creates a module ‘iface’ used by +files cmi.f90 and dmi.f90 then src/Makevars needs +to contain something like +

+
+
cmi.o dmi.o: iface.o
+
+ +
+ + + +

1.2.4 Using C++11 code

+ +

R can be built without a C++ compiler although one is available (but +not necessarily installed) on all known R platforms. For full +portability across platforms, all that can be assumed is approximate +support for the C++98 standard (the widely used g++ deviates +considerably from the standard). Some compilers have a concept of +‘C++03’ (‘essentially a bug fix’) or ‘C++ Technical Report 1’ (TR1), an +optional addition to the ‘C++03’ revision which was published in 2007. +A revised standard was published in 2011 and compilers with fairly +complete implementations are becoming available. C++11 added all of the +C99 features which are not otherwise implemented in C++, and C++ +compilers commonly accept C99 extensions to C++98. A minor update to +C++11 (sometimes known as C++14) was approved in August 2014. +

+

Since version 3.1.0, R has provided support for C++11 in packages in +addition to C++98. This support is not uniform across platforms as it +depends on the capabilities of the compiler (see below). When R is +configured, it will determine whether the C++ compiler supports C++11 +and which compiler flags, if any, are required to enable C++11 support. +For example, recent versions of g++ or clang++ +accept the compiler flag -std=c++11, and earlier versions +support a flag -std=c++0x, but the latter only provides partial +support for the C++11 standard. +

+

In order to use C++11 code in a package, the package’s Makevars +file (or Makevars.win on Windows) should include the line +

+
+
CXX_STD = CXX11
+
+ +

Compilation and linking will then be done with the C++11 compiler. If +any other value is given to the ‘CXX_STD’ macro it will be ignored. +(Further options may become available in the future as the C++ standard +evolves.) +

+

Packages without a Makevars file may specify that they require +C++11 by including ‘C++11’ in the ‘SystemRequirements’ field +of the DESCRIPTION file, e.g. +

+
+
SystemRequirements: C++11
+
+ +

If a package does have a Makevars[.win] file then setting the +make variable ‘CXX_STD’ is preferred, as it allows R CMD +SHLIB to work correctly in the package’s src directory. +

+

The C++11 compiler will be used systematically by R for all C++ code +if the environment variable USE_CXX1X is defined (with any +value). Hence this environment variable should be defined when invoking +R CMD SHLIB in the absence of a Makevars file (or +Makevars.win on Windows) if a C++11 compiler is required. +

+

Further control over compilation of C++11 code can be obtained by +specifying the macros ‘CXX1X’ and ‘CXX1XSTD’ when R is +configured32, or in a personal or site Makevars file. +See Customizing package compilation in R Installation and Administration. +If C++11 support is not available then these macros are both +empty. Otherwise, ‘CXX1X’ defaults to the same value as the C++ +compiler ‘CXX’ and the flag ‘CXX1XSTD’ defaults to +-std=c++11 or -std=c++0x (the latter on Windows). It +is possible to specify ‘CXX1X’ to be a distinct compiler just for +C++11–using packages, e.g. g++ on Solaris. Note however +that different C++ compilers (and even different versions of the same +compiler) often differ in their ABI so their outputs can rarely be +mixed. By setting ‘CXX1XSTD’ it is also possible to choose a +different dialect of the standard, such as -std=gnu++11, or +enable support for the 2014 revision using something like +-std=c++14 or -std=c++1y. +

+

As noted above, support for C++11 varies across platforms. The default +compiler on Windows is GCC 4.6.x and supports the -std=c++0x +flag and some C++11 features (see +https://gcc.gnu.org/gcc-4.6/cxx0x_status.html). On some +platforms, it may be possible or necessary to select a different +compiler for C++11, via personal or site Makevars files. +

+

There is no guarantee that C++11 can be used in a package in combination +with any other compiled language (even C), as the C++11 compiler may be +incompatible with the native compilers for the platform. (There are +known problems mixing C++11 with Fortran.) +

+

If a package using C++11 has a configure script it is +essential that it selects the correct compiler, via something like +

+
+
CXX1X=`"${R_HOME}/bin/R" CMD config CXX1X`
+CXX1XSTD=`"${R_HOME}/bin/R" CMD config CXX1XSTD`
+CXX="${CXX1X} ${CXX1XSTD}"
+CXXFLAGS=`"${R_HOME}/bin/R" CMD config CXX1XFLAGS`
+AC_LANG(C++)
+
+ +

(paying attention to all the quotes required). +

+
+ + + +

1.3 Checking and building packages

+ +

Before using these tools, please check that your package can be installed (which also checks that it can be loaded). R CMD check will inter alia do this, but you may get more detailed error messages doing the install directly.

+ + + + + + +

If your package specifies an encoding in its DESCRIPTION file, +you should run these tools in a locale which makes use of that encoding: +they may not work at all or may work incorrectly in other locales +(although UTF-8 locales will most likely work). +

+
+

Note: R CMD check and R CMD build run R processes with +--vanilla in which none of the user’s startup files are read. +If you need R_LIBS set (to find packages in a non-standard +library) you can set it in the environment: also you can use the check +and build environment files (as specified by the environment variables +R_CHECK_ENVIRON and R_BUILD_ENVIRON; if unset, +files33 ~/.R/check.Renviron and +~/.R/build.Renviron are used) to set environment variables when +using these utilities. +

+ +
+

Note to Windows users: R CMD build may make use of the Windows toolset (see the “R +Installation and Administration” manual) if present and in your path, +and it is required for packages which need it to install (including +those with configure.win or cleanup.win scripts or a +src directory) and e.g. need vignettes built. +

+

You may need to set the environment variable TMPDIR to point to a +suitable writable directory with a path not containing spaces – use +forward slashes for the separators. Also, the directory needs to be on +a case-honouring file system (some network-mounted file systems are +not). +

+ + +
+ + + +

1.3.1 Checking packages

+ + + +

Using R CMD check, the R package checker, one can test whether +source R packages work correctly. It can be run on one or +more directories, or compressed package tar archives with +extension .tar.gz, .tgz, .tar.bz2 or +.tar.xz. +

+

It is strongly recommended that the final checks are run on a +tar archive prepared by R CMD build. +
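A typical sequence (the package name and version are placeholders) is therefore:

R CMD build mypkg
R CMD check mypkg_1.0.tar.gz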

+

This runs a series of checks, including +

+
    +
  1. The package is installed. This will warn about missing cross-references +and duplicate aliases in help files. + +
  2. The file names are checked to be valid across file systems and supported +operating system platforms. + +
  3. The files and directories are checked for sufficient permissions +(Unix-alikes only). + +
  4. The files are checked for binary executables, using a suitable version +of file if available34. (There may be +rare false positives.) + +
  5. The DESCRIPTION file is checked for completeness, and some of its +entries for correctness. Unless installation tests are skipped, +checking is aborted if the package dependencies cannot be resolved at +run time. (You may need to set R_LIBS in the environment if +dependent packages are in a separate library tree.) One check is that +the package name is not that of a standard package, nor one of the +defunct standard packages (‘ctest’, ‘eda’, ‘lqs’, +‘mle’, ‘modreg’, ‘mva’, ‘nls’, ‘stepfun’ and +‘ts’). Another check is that all packages mentioned in +library or requires or from which the NAMESPACE +file imports or are called via :: or ::: are listed +(in ‘Depends’, ‘Imports’, ‘Suggests’): this is not an +exhaustive check of the actual imports. + +
  6. Available index information (in particular, for demos and vignettes) is +checked for completeness. + +
  7. The package subdirectories are checked for suitable file names and for +not being empty. The checks on file names are controlled by the option +--check-subdirs=value. This defaults to ‘default’, +which runs the checks only if checking a tarball: the default can be +overridden by specifying the value as ‘yes’ or ‘no’. Further, +the check on the src directory is only run if the package +does not contain a configure script (which corresponds to the +value ‘yes-maybe’) and there is no src/Makefile or +src/Makefile.in. + +

    To allow a configure script to generate suitable files, files +ending in ‘.in’ will be allowed in the R directory. +

    +

    A warning is given for directory names that look like R package check +directories – many packages have been submitted to CRAN +containing these. +

    +
  8. The R files are checked for syntax errors. Bytes which are +non-ASCII are reported as warnings, but these should be +regarded as errors unless it is known that the package will always be +used in the same locale. + +
  9. It is checked that the package can be loaded, first with the usual default packages and then only with package base already loaded. It is checked that the namespace can be loaded in an empty session with only the base namespace loaded. (Namespaces and packages can be loaded very early in the session, before the default packages are available, so packages should work then.)
  10. The R files are checked for correct calls to library.dynam. +Package startup functions are checked for correct argument lists and +(incorrect) calls to functions which modify the search path or +inappropriately generate messages. The R code is checked for +possible problems using codetools. In addition, it is checked +whether S3 methods have all arguments of the corresponding generic, and +whether the final argument of replacement functions is called +‘value’. All foreign function calls (.C, .Fortran, +.Call and .External calls) are tested to see if they have +a PACKAGE argument, and if not, whether the appropriate DLL might +be deduced from the namespace of the package. Any other calls are +reported. (The check is generous, and users may want to supplement this +by examining the output of tools::checkFF("mypkg", verbose=TRUE), +especially if the intention were to always use a PACKAGE +argument) + +
  11. The Rd files are checked for correct syntax and metadata, +including the presence of the mandatory fields (\name, \alias, +\title and \description). The Rd name and +title are checked for being non-empty, and there is a check for missing +cross-references (links). + +
  12. A check is made for missing documentation entries, such as undocumented +user-level objects in the package. + +
  13. Documentation for functions, data sets, and S4 classes is checked for +consistency with the corresponding code. + +
  14. It is checked whether all function arguments given in \usage +sections of Rd files are documented in the corresponding +\arguments section. + +
  15. The data directory is checked for non-ASCII characters +and for the use of reasonable levels of compression. + +
  16. C, C++ and FORTRAN source and header files35 are +tested for portable (LF-only) line endings. If there is a +Makefile or Makefile.in or Makevars or +Makevars.in file under the src directory, it is checked +for portable line endings and the correct use of ‘$(BLAS_LIBS)’ and +‘$(LAPACK_LIBS)’ + +

    Compiled code is checked for symbols corresponding to functions which +might terminate R or write to stdout/stderr instead of +the console. Note that the latter might give false positives in that +the symbols might be pulled in with external libraries and could never +be called. Windows36 users +should note that the Fortran and C++ runtime libraries are examples of +such external libraries. +

    +
  17. Some checks are made of the contents of the inst/doc directory. +These always include checking for files that look like leftovers, and if +suitable tools (such as qpdf) are available, checking that the +PDF documentation is of minimal size. + +
  18. The examples provided by the package’s documentation are run. +(see Writing R documentation files, for information on using +\examples to create executable example code.) If there is a file +tests/Examples/pkg-Ex.Rout.save, the output of running the +examples is compared to that file. + +

    Of course, released packages should be able to run at least their own +examples. Each example is run in a ‘clean’ environment (so earlier +examples cannot be assumed to have been run), and with the variables +T and F redefined to generate an error unless they are set +in the example: See Logical vectors in An +Introduction to R. +

    +
  19. If the package sources contain a tests directory then the tests +specified in that directory are run. (Typically they will consist of a +set of .R source files and target output files +.Rout.save.) Please note that the comparison will be done in the +end user’s locale, so the target output files should be ASCII +if at all possible. (The command line option --test-dir=foo may +be used to specify tests in a non-standard location. For example, +unusually slow tests could be placed in inst/slowTests and then +R CMD check --test-dir=inst/slowTests would be used to run them. +Other names that have been suggested are, for example, +inst/testWithOracle for tests that require Oracle to be installed, +inst/randomTests for tests which use random values and may +occasionally fail by chance, etc.) + +
  20. The code in package vignettes (see Writing package vignettes) is +executed, and the vignette PDFs re-made from their sources as a check of +completeness of the sources (unless there is a ‘BuildVignettes’ +field in the package’s DESCRIPTION file with a false value). If +there is a target output file .Rout.save in the vignette source +directory, the output from running the code in that vignette is compared +with the target output file and any differences are reported (but not +recorded in the log file). (If the vignette sources are in the +deprecated location inst/doc, do mark such target output files to +not be installed in .Rinstignore.) + +

    If there is an error37 in executing the R code in vignette foo.ext, a log +file foo.ext.log is created in the check directory. The +vignette PDFs are re-made in a copy of the package sources in the +vign_test subdirectory of the check directory, so for further +information on errors look in directory +pkgname/vign_test/vignettes. (It is only retained if there +are errors or if environment variable _R_CHECK_CLEAN_VIGN_TEST_ is +set to a false value.) +

    +
  21. The PDF version of the package’s manual is created (to check that the +Rd files can be converted successfully). This needs LaTeX and +suitable fonts and LaTeX packages to be installed. +See Making the manuals in R Installation and Administration. + +
+ +

All these tests are run with collation set to the C locale, and +for the examples and tests with environment variable LANGUAGE=en: +this is to minimize differences between platforms. +

+

Use R CMD check --help to obtain more information about the usage of the R package checker. A subset of the checking steps can be selected by adding command-line options. It also allows customization by setting environment variables _R_CHECK_*_ as described in Tools in R Internals: a set of these customizations similar to those used by CRAN can be selected by the option --as-cran (which works best if Internet access is available). Some Windows users may need to set environment variable R_WIN_NO_JUNCTIONS to a non-empty value. The test of cyclic declarations38 in DESCRIPTION files needs repositories (including CRAN) set: do this in ~/.Rprofile, by e.g.

+
options(repos = c(CRAN="https://cran.r-project.org"))
+
+ +

One check customization which can be revealing is +

+
_R_CHECK_CODETOOLS_PROFILE_="suppressLocalUnused=FALSE"
+
+

which reports unused local assignments. Not only does this point out +computations which are unnecessary because their results are unused, it +also can show errors. (Two such are to intend to update an object by +assigning a value but mistype its name or assign in the wrong scope, +for example using <- where <<- was intended.) This can +give false positives, most commonly because of non-standard evaluation +for formulae and because the intention is to return objects in the +environment of a function for later use. +
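As a purely illustrative R sketch (not from the manual), the following function contains a mistyped assignment of the kind this customization would flag as an unused local variable:

total_score <- function(x) {
    total <- 0
    for (xi in x) totl <- total + xi   # typo: assigns to unused 'totl'
    total                              # always returns 0
}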

+

Complete checking of a package which contains a file README.md needs pandoc installed: see http://johnmacfarlane.net/pandoc/installing.html. This should be reasonably current: CRAN used version 1.12.4.1 to process these files at the time of writing.

+

You do need to ensure that the package is checked in a suitable locale +if it contains non-ASCII characters. Such packages are likely +to fail some of the checks in a C locale, and R CMD +check will warn if it spots the problem. You should be able to check +any package in a UTF-8 locale (if one is available). Beware that +although a C locale is rarely used at a console, it may be the +default if logging in remotely or for batch jobs. +
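For example (a sketch; the locale name depends on the platform), a UTF-8 locale can be selected just for the check run:

LC_ALL=en_US.UTF-8 R CMD check mypkg_1.0.tar.gz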

+
+

Multiple sub-architectures: On systems which support multiple sub-architectures (principally +Windows), R CMD check will install and check a package which +contains compiled code under all available sub-architectures. (Use +option --force-multiarch to force this for packages without +compiled code, which are otherwise only checked under the main +sub-architecture.) This will run the loading tests, examples and +tests directory under each installed sub-architecture in turn, +and give an error if any fail. Where environment variables (including +perhaps PATH) need to be set differently for each +sub-architecture, these can be set in architecture-specific files such +as R_HOME/etc/i386/Renviron.site. +

+

An alternative approach is to use R CMD check --no-multiarch +to check the primary sub-architecture, and then to use something like +R --arch=x86_64 CMD check --extra-arch or (Windows) +/path/to/R/bin/x64/Rcmd check --extra-arch to run for each +additional sub-architecture just the checks39 which differ by sub-architecture. (This +approach is required for packages which are installed by R CMD +INSTALL --merge-multiarch.) +

+

Where packages need additional commands to install all the +sub-architectures these can be supplied by e.g. +--install-args=--force-biarch. +

+
+ + +
+ + + +

1.3.2 Building package tarballs

+ + + + + +

Packages may be distributed in source form as “tarballs” +(.tar.gz files) or in binary form. The source form can be +installed on all platforms with suitable tools and is the usual form for +Unix-like systems; the binary form is platform-specific, and is the more +common distribution form for the Windows and OS X platforms. +

+

Using R CMD build, the R package builder, one can build R +package tarballs from their sources (for example, for subsequent release). +

+

Prior to actually building the package in the standard gzipped tar file +format, a few diagnostic checks and cleanups are performed. In +particular, it is tested whether object indices exist and can be assumed +to be up-to-date, and C, C++ and FORTRAN source files and relevant +makefiles in a src directory are tested and converted to LF +line-endings if necessary. +

+

Run-time checks whether the package works correctly should be performed +using R CMD check prior to invoking the final build procedure. +

+ +

To exclude files from being put into the package, one can specify a list +of exclude patterns in file .Rbuildignore in the top-level source +directory. These patterns should be Perl-like regular expressions (see +the help for regexp in R for the precise details), one per +line, to be matched case-insensitively40 against the file and directory names relative to the +top-level package source directory. In addition, directories from +source control systems41 or from eclipse42, directories with names ending .Rcheck or +Old or old and files GNUMakefile43, Read-and-delete-me +or with base names starting with ‘.#’, or starting and ending with +‘#’, or ending in ‘~’, ‘.bak’ or ‘.swp’, are +excluded by default. In addition, those files in the R, +demo and man directories which are flagged by R +CMD check as having invalid names will be excluded. +
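A short hypothetical .Rbuildignore (each line is a Perl-like regular expression matched against relative paths) might look like:

^notes$
^scratch/
\.log$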

+

Use R CMD build --help to obtain more information about the usage +of the R package builder. +

+

Unless R CMD build is invoked with the +--no-build-vignettes option (or the package’s +DESCRIPTION contains ‘BuildVignettes: no’ or similar), it +will attempt to (re)build the vignettes (see Writing package vignettes) in the package. To do so it installs the current package +into a temporary library tree, but any dependent packages need to be +installed in an available library tree (see the Note: at the top of this +section). +

+

Similarly, if the .Rd documentation files contain any +\Sexpr macros (see Dynamic pages), the package will be +temporarily installed to execute them. Post-execution binary copies of +those pages containing build-time macros will be saved in +build/partial.rdb. If there are any install-time or render-time +macros, a .pdf version of the package manual will be built and +installed in the build subdirectory. (This allows +CRAN or other repositories to display the manual even if they +are unable to install the package.) This can be suppressed by the +option --no-manual or if package’s DESCRIPTION contains +‘BuildManual: no’ or similar. +

+

One of the checks that R CMD build runs is for empty source +directories. These are in most (but not all) cases unintentional, if +they are intentional use the option --keep-empty-dirs (or set +the environment variable _R_BUILD_KEEP_EMPTY_DIRS_ to ‘TRUE’, +or have a ‘BuildKeepEmpty’ field with a true value in the +DESCRIPTION file). +

+

The --resave-data option allows saved images (.rda and +.RData files) in the data directory to be optimized for +size. It will also compress tabular files and convert .R files +to saved images. It can take values no, gzip (the default +if this option is not supplied, which can be changed by setting the +environment variable _R_BUILD_RESAVE_DATA_) and best +(equivalent to giving it without a value), which chooses the most +effective compression. Using best adds a dependence on R +(>= 2.10) to the DESCRIPTION file if bzip2 or +xz compression is selected for any of the files. If this is +thought undesirable, --resave-data=gzip (which is the default +if that option is not supplied) will do what compression it can with +gzip. A package can control how its data is resaved by +supplying a ‘BuildResaveData’ field (with one of the values given +earlier in this paragraph) in its DESCRIPTION file. +

+

The --compact-vignettes option will run +tools::compactPDF over the PDF files in inst/doc (and its +subdirectories) to losslessly compress them. This is not enabled by +default (it can be selected by environment variable +_R_BUILD_COMPACT_VIGNETTES_) and needs qpdf +(http://qpdf.sourceforge.net/) to be available. +

+

It can be useful to run R CMD check --check-subdirs=yes on the +built tarball as a final check on the contents. +

+

Where a non-POSIX file system is in use which does not utilize execute +permissions, some care is needed with permissions. This applies on +Windows and to e.g. FAT-formatted drives and SMB-mounted file systems +on other OSes. The ‘mode’ of the file recorded in the tarball will be +whatever file.info() returns. On Windows this will record only +directories as having execute permission and on other OSes it is likely +that all files have reported ‘mode’ 0777. A particular issue is +packages being built on Windows which are intended to contain executable +scripts such as configure and cleanup: R CMD +build ensures those two are recorded with execute permission. +

+

Directory build of the package sources is reserved for use by +R CMD build: it contains information which may not easily be +created when the package is installed, including index information on +the vignettes and, rarely, information on the help pages and perhaps a +copy of the PDF reference manual (see above). +

+
+ + + +

1.3.3 Building binary packages

+ + +

Binary packages are compressed copies of installed versions of +packages. They contain compiled shared libraries rather than C, C++ or +Fortran source code, and the R functions are included in their installed +form. The format and filename are platform-specific; for example, a +binary package for Windows is usually supplied as a .zip file, +and for the OS X platform the default binary package file extension is +.tgz. +

+

The recommended method of building binary packages is to use +

+

R CMD INSTALL --build pkg +where pkg is either the name of a source tarball (in the usual +.tar.gz format) or the location of the directory of the package +source to be built. This operates by first installing the package and +then packing the installed binaries into the appropriate binary package +file for the particular platform. +

+

By default, R CMD INSTALL --build will attempt to install the +package into the default library tree for the local installation of +R. This has two implications: +

+
    +
  • If the installation is successful, it will overwrite any existing installation +of the same package. + +
  • The default library tree must have write permission; if not, the package will +not install and the binary will not be created. + +
+ +

To prevent changes to the present working installation or to provide an +install location with write access, create a suitably located directory +with write access and use the -l option to build the package +in the chosen location. The usage is then +

+

R CMD INSTALL -l location --build pkg +

+

where location is the chosen directory with write access. The package +will be installed as a subdirectory of location, and the package binary +will be created in the current directory. +
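A sketch of the complete sequence (the directory and package names are hypothetical):

mkdir ~/R-binary-lib
R CMD INSTALL -l ~/R-binary-lib --build mypkg_1.0.tar.gz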

+

Other options for R CMD INSTALL can be found using R +CMD INSTALL --help, and platform-specific details for special cases are +discussed in the platform-specific FAQs. +

+ + + +

Finally, at least one web-based service is available for building binary +packages from (checked) source code: WinBuilder (see +http://win-builder.R-project.org/) is able to build Windows +binaries. Note that this is intended for developers on other platforms +who do not have access to Windows but wish to provide binaries for the +Windows platform. +

+
+ + + +

1.4 Writing package vignettes

+ + + + + + + + +

In addition to the help files in Rd format, R packages allow +the inclusion of documents in arbitrary other formats. The standard +location for these is subdirectory inst/doc of a source package, +the contents will be copied to subdirectory doc when the package +is installed. Pointers from package help indices to the installed +documents are automatically created. Documents in inst/doc can +be in arbitrary format, however we strongly recommend providing them in +PDF format, so users on almost all platforms can easily read them. To +ensure that they can be accessed from a browser (as an HTML index is +provided), the file names should start with an ASCII letter +and be comprised entirely of ASCII letters or digits or hyphen +or underscore. +

+

A special case is package vignettes. Vignettes are documents in +PDF or HTML format obtained from plain text literate source files +from which R knows how to extract R code and create output (in +PDF/HTML or intermediate (La)TeX). Vignette engines do this work, +using “tangle” and “weave” functions respectively. Sweave, provided +by the R distribution, is the default engine. Since R version 3.0.0, +other vignette engines besides Sweave are supported; see Non-Sweave vignettes. +

+

Package vignettes have their sources in subdirectory vignettes of +the package sources. Note that the location of the vignette sources +only affects R CMD build and R CMD check: the +tarball built by R CMD build includes in inst/doc the +components intended to be installed. +

+

Sweave vignette sources are normally given the file extension +.Rnw or .Rtex, but for historical reasons +extensions44 .Snw and +.Stex are also recognized. Sweave allows the integration of +LaTeX documents: see the Sweave help page in R and the +Sweave vignette in package utils for details on the +source document format. +

+

Package vignettes are tested by R CMD check by executing all R +code chunks they contain (except those marked for non-evaluation, e.g., +with option eval=FALSE for Sweave). The R working directory +for all vignette tests in R CMD check is a copy of the +vignette source directory. Make sure all files needed to run the R +code in the vignette (data sets, …) are accessible by either +placing them in the inst/doc hierarchy of the source package or +by using calls to system.file(). All other files needed to +re-make the vignettes (such as LaTeX style files, BibTeX input +files and files for any figures not created by running the code in the +vignette) must be in the vignette source directory. +
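For instance (an illustrative sketch; the package and file names are hypothetical), a vignette code chunk can locate a data file shipped under inst/extdata with:

csv <- system.file("extdata", "survey.csv", package = "mypkg")
dat <- read.csv(csv)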

+

R CMD build will automatically45 create the +(PDF or HTML versions of the) vignettes in inst/doc for +distribution with the package sources. By including the vignette +outputs in the package sources it is not necessary that these can be +re-built at install time, i.e., the package author can use private R +packages, screen snapshots and LaTeX extensions which are only +available on his machine.46 +

+

By default R CMD build will run Sweave on all Sweave +vignette source files in vignettes. If Makefile is found +in the vignette source directory, then R CMD build will try to +run make after the Sweave runs, otherwise +texi2pdf is run on each .tex file produced. +

+

The first target in the Makefile should take care of both +creation of PDF/HTML files and cleaning up afterwards (including +after Sweave), i.e., delete all files that shall not appear in +the final package archive. Note that if the make step runs R +it needs to be careful to respect the environment values of R_LIBS +and R_HOME47. +Finally, if there is a Makefile and it has a ‘clean:’ +target, make clean is run. +

+

All the usual caveats about including a Makefile apply. +It must be portable (no GNU extensions), use LF line endings +and must work correctly with a parallel make: too many authors +have written things like +

+
+
## BAD EXAMPLE
+all: pdf clean
+
+pdf: ABC-intro.pdf ABC-details.pdf
+
+%.pdf:  %.tex
+	texi2dvi --pdf $*
+
+clean:
+	rm *.tex ABC-details-*.pdf
+
+ +

which will start removing the source files whilst pdflatex is +working. +
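One way to avoid this race (a sketch under the same assumptions as the bad example, and not taken from the manual) is not to make clean a prerequisite of the default target, relying instead on R CMD build running the ‘clean:’ target afterwards; recipe lines must be indented with a tab:

all: ABC-intro.pdf ABC-details.pdf

.SUFFIXES: .tex .pdf
.tex.pdf:
	texi2dvi --pdf $<

clean:
	rm -f *.log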

+

Metadata lines can be placed in the source file, preferably in LaTeX +comments in the preamble. One such is a \VignetteIndexEntry of +the form +

+
%\VignetteIndexEntry{Using Animal}
+
+

Others you may see are \VignettePackage (currently ignored), +\VignetteDepends and \VignetteKeyword (which replaced +\VignetteKeywords). These are processed at package installation +time to create the saved data frame Meta/vignette.rds, but only +the \VignetteIndexEntry and \VignetteKeyword statements +are currently used. The \VignetteEngine statement +is described in Non-Sweave vignettes. +

+

At install time an HTML index for all vignettes in the package is automatically created from the \VignetteIndexEntry statements unless a file index.html exists in directory inst/doc. This index is linked from the HTML help index for the package. If you do supply an inst/doc/index.html file it should contain relative links only to files under the installed doc directory, or perhaps (not really an index) to HTML help files or to the DESCRIPTION file, and be valid HTML as confirmed via the W3C Markup Validation Service or Validator.nu.

+

Sweave/Stangle allows the document to specify the split=TRUE +option to create a single R file for each code chunk: this will not +work for vignettes where it is assumed that each vignette source +generates a single file with the vignette extension replaced by +.R. +

+

Do watch that PDFs are not too large – one in a CRAN package +was 72MB! This is usually caused by the inclusion of overly detailed +figures, which will not render well in PDF viewers. Sometimes it is +much better to generate fairly high resolution bitmap (PNG, JPEG) +figures and include those in the PDF document. +

+ +

When R CMD build builds the vignettes, it copies these and +the vignette sources from directory vignettes to inst/doc. +To install any other files from the vignettes directory, include +a file vignettes/.install_extras which specifies these as +Perl-like regular expressions on one or more lines. (See the +description of the .Rinstignore file for full details.) +

+ +
+ + + +

1.4.1 Encodings and vignettes

+ +

Vignettes will in general include descriptive text, R input, R +output and figures, LaTeX include files and bibliographic references. +As any of these may contain non-ASCII characters, the handling +of encodings can become very complicated. +

+

The vignette source file should be written in ASCII or contain +a declaration of the encoding (see below). This applies even to +comments within the source file, since vignette engines process comments +to look for options and metadata lines. When an engine’s weave and +tangle functions are called on the vignette source, it will be converted +to the encoding of the current R session. +

+

Stangle() will produce an R code file in the current locale’s encoding: for a non-ASCII vignette, the encoding used is recorded in a comment at the top of the file.

+

Sweave() will produce a .tex file in the current +encoding, or in UTF-8 if that is declared. Non-ASCII encodings +need to be declared to LaTeX via a line like +

+
\usepackage[utf8]{inputenc}
+
+

(It is also possible to use the more recent ‘inputenx’ LaTeX +package.) For files where this line is not needed (e.g. chapters +included within the body of a larger document, or non-Sweave +vignettes), the encoding may be declared using a comment like +

+
%!\VignetteEncoding{UTF-8}
+
+

If the encoding is UTF-8, this can also be declared using +the declaration +

+
%!\SweaveUTF8
+
+

If no declaration is given in the vignette, it will be assumed to be +in the encoding declared for the package. If there is no encoding +declared in either place, then it is an error to use non-ASCII +characters in the vignette. +
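For example, a package whose sources are in UTF-8 can declare this once in its DESCRIPTION file (a one-line sketch):

Encoding: UTF-8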

+

In any case, be aware that LaTeX may require the ‘usepackage’ +declaration. +

+

Sweave() will also parse and evaluate the R code in each +chunk. The R output will also be in the current locale (or UTF-8 +if so declared), and should +be covered by the ‘inputenc’ declaration. One thing people often +forget is that the R output may not be ASCII even for +ASCII R sources, for many possible reasons. One common one +is the use of ‘fancy’ quotes: see the R help on sQuote: note +carefully that it is not portable to declare UTF-8 or CP1252 to cover +such quotes, as their encoding will depend on the locale used to run +Sweave(): this can be circumvented by setting +options(useFancyQuotes="UTF-8") in the vignette. +

+

The final issue is the encoding of figures – this applies only to PDF +figures and not PNG etc. The PDF figures will contain declarations for +their encoding, but the Sweave option pdf.encoding may need to be +set appropriately: see the help for the pdf() graphics device. +

+

As a real example of the complexities, consider the fortunes +package version ‘1.4-0’. That package did not have a declared +encoding, and its vignette was in ASCII. However, the data it +displays are read from a UTF-8 CSV file and will be assumed to be in the +current encoding, so fortunes.tex will be in UTF-8 in any locale. +Had read.table been told the data were UTF-8, fortunes.tex +would have been in the locale’s encoding. +

+
+ + + +

1.4.2 Non-Sweave vignettes

+ +

R 3.0.0 and later allow vignettes in formats other than Sweave by +means of “vignette engines”. For example knitr version 1.1 +or later can create .tex files from a variation on Sweave format, +and .html files from a variation on “markdown” format. These +engines replace the Sweave() function with other functions to +convert vignette source files into LaTeX files for processing into +.pdf, or directly into .pdf or .html files. The +Stangle() function is replaced with a function that extracts the +R source from a vignette. +

+

R recognizes non-Sweave vignettes using filename extensions specified +by the engine. For example, the knitr package supports +the extension .Rmd (standing for +“R markdown”). The user indicates the vignette engine +within the vignette source using a \VignetteEngine line, for example +

+
%\VignetteEngine{knitr::knitr}
+
+

This specifies the name of a package and an engine to use in place of +Sweave in processing the vignette. As Sweave is the only engine +supplied with the R distribution, the package providing any other +engine must be specified in the ‘VignetteBuilder’ field of the +package DESCRIPTION file, and also specified in the +‘Suggests’, ‘Imports’ or ‘Depends’ field (since its +namespace must be available to build or check your package). If more +than one package is specified as a builder, they will be searched in the +order given there. The utils package is always implicitly +appended to the list of builder packages, but may be included earlier +to change the search order. +

+

Note that a package with non-Sweave vignettes should always have a +‘VignetteBuilder’ field in the DESCRIPTION file, since this +is how R CMD check recognizes that there are vignettes to be +checked: packages listed there are required when the package is checked. +
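For a package using knitr vignettes, the relevant DESCRIPTION entries might therefore look like (a sketch; other fields omitted):

Suggests: knitr
VignetteBuilder: knitr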

+

The vignette engine can produce .tex, .pdf, or .html +files as output. If it produces .tex files, R will +call texi2pdf to convert them to .pdf for display +to the user (unless there is a Makefile in the vignettes +directory). +

+

Package writers who would like to supply vignette engines need +to register those engines in the package .onLoad function. +For example, that function could make the call +

+
tools::vignetteEngine("knitr", weave = vweave, tangle = vtangle,
+		      pattern = "[.]Rmd$", package = "knitr")
+
+

(The actual registration in knitr is more complicated, because +it supports other input formats.) See the ?tools::vignetteEngine +help topic for details on engine registration. +

+ +
+ + + +

1.5 Package namespaces

+ + +

R has a namespace management system for code in packages. This +system allows the package writer to specify which variables in the +package should be exported to make them available to package +users, and which variables should be imported from other +packages. +

+

The namespace for a package is specified by the +NAMESPACE file in the top level package directory. This file +contains namespace directives describing the imports and exports +of the namespace. Additional directives register any shared objects to +be loaded and any S3-style methods that are provided. Note that +although the file looks like R code (and often has R-style +comments) it is not processed as R code. Only very simple +conditional processing of if statements is implemented. +

+

Packages are loaded and attached to the search path by calling +library or require. Only the exported variables are +placed in the attached frame. Loading a package that imports variables +from other packages will cause these other packages to be loaded as well +(unless they have already been loaded), but they will not be +placed on the search path by these implicit loads. Thus code in the +package can only depend on objects in its own namespace and its imports +(including the base namespace) being visible48. +

+

Namespaces are sealed once they are loaded. Sealing means that +imports and exports cannot be changed and that internal variable +bindings cannot be changed. Sealing allows a simpler implementation +strategy for the namespace mechanism. Sealing also allows code +analysis and compilation tools to accurately identify the definition +corresponding to a global variable reference in a function body. +

+

The namespace controls the search strategy for variables used by +functions in the package. If not found locally, R searches the +package namespace first, then the imports, then the base namespace and +then the normal search path. +

+

Prior to R 2.14.0, namespaces were optional in packages: a default +namespace was generated on installation in 2.14.x and 2.15.x. As from +3.0.0 a namespace is mandatory. +

+ + + + + + + + + + + +
+ + + +

1.5.1 Specifying imports and exports

+ +

Exports are specified using the export directive in the +NAMESPACE file. A directive of the form +

+ +
+
export(f, g)
+
+ +

specifies that the variables f and g are to be exported. +(Note that variable names may be quoted, and reserved words and +non-standard names such as [<-.fractions must be.) +

+

For packages with many variables to export it may be more convenient to +specify the names to export with a regular expression using +exportPattern. The directive +

+ +
+
exportPattern("^[^\\.]")
+
+ +

exports all variables that do not start with a period. However, such +broad patterns are not recommended for production code: it is better to +list all exports or use narrowly-defined groups. (This pattern applies +to S4 classes.) Beware of patterns which include names starting with a +period: some of these are internal-only variables and should never be +exported, e.g. ‘.__S3MethodsTable__.’ (and the code nowadays +excludes known cases). +

+

Packages implicitly import the base namespace. +Variables exported from other packages with namespaces need to be +imported explicitly using the directives import and +importFrom. The import directive imports all exported +variables from the specified package(s). Thus the directives +

+ +
+
import(foo, bar)
+
+ +

specifies that all exported variables in the packages foo and +bar are to be imported. If only some of the exported variables +from a package are needed, then they can be imported using +importFrom. The directive +

+ +
+
importFrom(foo, f, g)
+
+ +

specifies that the exported variables f and g of the +package foo are to be imported. Using importFrom +selectively rather than import is good practice and recommended +notably when importing from packages with more than a dozen exports. +

+

It is possible to export variables from a namespace which it has +imported from other namespaces: this has to be done explicitly and not +via exportPattern. +

+

If a package only needs a few objects from another package it can use a +fully qualified variable reference in the code instead of a formal +import. A fully qualified reference to the function f in package +foo is of the form foo::f. This is slightly less efficient +than a formal import and also loses the advantage of recording all +dependencies in the NAMESPACE file (but they still need to be +recorded in the DESCRIPTION file). Evaluating foo::f will +cause package foo to be loaded, but not attached, if it was not +loaded already—this can be an advantage in delaying the loading of a +rarely used package. +

+

Using foo:::f instead of foo::f allows access to +unexported objects. This is generally not recommended, as the +semantics of unexported objects may be changed by the package author +in routine maintenance. +

+
+ + + +

1.5.2 Registering S3 methods

+ +

The standard method for S3-style UseMethod dispatching might fail +to locate methods defined in a package that is imported but not attached +to the search path. To ensure that these methods are available the +packages defining the methods should ensure that the generics are +imported and register the methods using S3method directives. If +a package defines a function print.foo intended to be used as a +print method for class foo, then the directive +

+ +
+
S3method(print, foo)
+
+ +

ensures that the method is registered and available for UseMethod +dispatch, and the function print.foo does not need to be exported. +Since the generic print is defined in base it does not need +to be imported explicitly. +

+

(Note that function and class names may be quoted, and reserved words +and non-standard names such as [<- and function must +be.) +

+

It is possible to specify a third argument to S3method, the function to +be used as the method, for example +

+
+
S3method(print, check_so_symbols, .print.via.format)
+
+ +

when print.check_so_symbols is not needed. +

+

There used to be a limit on the number of S3method directives: it +was 500 prior to R 3.0.2. +

+
+ + + +

1.5.3 Load hooks

+ + + +

There are a number of hooks called as packages are loaded, attached, +detached, and unloaded. See help(".onLoad") for more details. +

+

Since loading and attaching are distinct operations, separate hooks are +provided for each. These hook functions are called .onLoad and +.onAttach. They both take arguments49 libname and +pkgname; they should be defined in the namespace but not +exported. +

+ + + +

Packages can use a .onDetach (as from R 3.0.0) or .Last.lib +function (provided the latter is exported from the namespace) when +detach is called on the package. It is called with a single +argument, the full path to the installed package. There is also a hook +.onUnload which is called when the namespace is unloaded +(via a call to unloadNamespace, perhaps called by +detach(unload = TRUE)) with argument the full path to the installed +package’s directory. .onUnload and .onDetach should be +defined in the namespace and not exported, but .Last.lib does +need to be exported. +

+

Packages are not likely to need .onAttach (except perhaps for a +start-up banner); code to set options and load shared objects should be +placed in a .onLoad function, or use made of the useDynLib +directive described next. +
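A minimal sketch (the package name and option name are purely illustrative) of hook functions following this advice:

.onLoad <- function(libname, pkgname) {
    ## set a package option only if the user has not already done so
    if (is.null(getOption("mypkg.verbose")))
        options(mypkg.verbose = FALSE)
}

.onUnload <- function(libpath) {
    ## unload compiled code loaded via a useDynLib directive or .onLoad
    library.dynam.unload("mypkg", libpath)
}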

+

User-level hooks are also available: see the help on function +setHook. +

+

These hooks are often used incorrectly. People forget to export .Last.lib. Compiled code should be loaded in .onLoad (or via a useDynLib directive: see below) and unloaded in .onUnload. Do remember that a package’s namespace can be loaded without the namespace being attached (e.g. by pkgname::fun) and that a package can be detached and re-attached whilst its namespace remains loaded.

+
+ +
+


+
+ +

1.5.4 useDynLib

+ +

A NAMESPACE file can contain one or more useDynLib directives, which specify shared objects that need to be loaded.50 The directive

+ +
+
useDynLib(foo)
+
+ +

registers the shared object foo51 for loading with library.dynam. +Loading of registered object(s) occurs after the package code has been +loaded and before running the load hook function. Packages that would +only need a load hook function to load a shared object can use the +useDynLib directive instead. +

+

The useDynLib directive also accepts the names of the native +routines that are to be used in R via the .C, .Call, +.Fortran and .External interface functions. These are given as +additional arguments to the directive, for example, +

+
+
useDynLib(foo, myRoutine, myOtherRoutine)
+
+ +

By specifying these names in the useDynLib directive, the native +symbols are resolved when the package is loaded and R variables +identifying these symbols are added to the package’s namespace with +these names. These can be used in the .C, .Call, +.Fortran and .External calls in place of the name of the +routine and the PACKAGE argument. For instance, we can call the +routine myRoutine from R with the code +

+
+
 .Call(myRoutine, x, y)
+
+ +

rather than +

+
+
 .Call("myRoutine", x, y, PACKAGE = "foo")
+
+ +

There are at least two benefits to this approach. Firstly, the symbol +lookup is done just once for each symbol rather than each time the +routine is invoked. Secondly, this removes any ambiguity in resolving +symbols that might be present in several compiled DLLs. +

+

In some circumstances, there will already be an R variable in the +package with the same name as a native symbol. For example, we may have +an R function in the package named myRoutine. In this case, +it is necessary to map the native symbol to a different R variable +name. This can be done in the useDynLib directive by using named +arguments. For instance, to map the native symbol name myRoutine +to the R variable myRoutine_sym, we would use +

+
+
useDynLib(foo, myRoutine_sym = myRoutine, myOtherRoutine)
+
+ +

We could then call that routine from R using the command +

+
+
 .Call(myRoutine_sym, x, y)
+
+ +

Symbols without explicit names are assigned to the R variable with +that name. +

+

In some cases, it may be preferable not to create R variables in the +package’s namespace that identify the native routines. It may be too +costly to compute these for many routines when the package is loaded +if many of these routines are not likely to be used. In this case, +one can still perform the symbol resolution correctly using the DLL, +but do this each time the routine is called. Given a reference to the +DLL as an R variable, say dll, we can call the routine +myRoutine using the expression +

+
+
 .Call(dll$myRoutine, x, y)
+
+ +

The $ operator resolves the routine with the given name in the DLL using a call to getNativeSymbolInfo. This is the same computation as above where we resolve the symbol when the package is loaded. The only difference is that this is done each time in the case of dll$myRoutine.

+

In order to use this dynamic approach (e.g., dll$myRoutine), one +needs the reference to the DLL as an R variable in the package. The +DLL can be assigned to a variable by using the variable = +dllName format used above for mapping symbols to R variables. For +example, if we wanted to assign the DLL reference for the DLL +foo in the example above to the variable myDLL, we would +use the following directive in the NAMESPACE file: +

+
+
myDLL = useDynLib(foo, myRoutine_sym = myRoutine, myOtherRoutine)
+
+ +

Then, the R variable myDLL is in the package’s namespace and +available for calls such as myDLL$dynRoutine to access routines +that are not explicitly resolved at load time. +

+

If the package has registration information (see Registering native routines), then we can use that directly rather than specifying the +list of symbols again in the useDynLib directive in the +NAMESPACE file. Each routine in the registration information is +specified by giving a name by which the routine is to be specified along +with the address of the routine and any information about the number and +type of the parameters. Using the .registration argument of +useDynLib, we can instruct the namespace mechanism to create +R variables for these symbols. For example, suppose we have the +following registration information for a DLL named myDLL: +

+
+
static R_NativePrimitiveArgType foo_t[] = {
+    REALSXP, INTSXP, STRSXP, LGLSXP
+};
+
+static R_CMethodDef cMethods[] = {
+   {"foo", (DL_FUNC) &foo, 4, foo_t},
+   {"bar_sym", (DL_FUNC) &bar, 0},
+   {NULL, NULL, 0}
+};
+
+static R_CallMethodDef callMethods[] = {
+   {"R_call_sym", (DL_FUNC) &R_call, 4},
+   {"R_version_sym", (DL_FUNC) &R_version, 0},
+   {NULL, NULL, 0}
+};
+
+ +

Then, the directive in the NAMESPACE file +

+
+
useDynLib(myDLL, .registration = TRUE)
+
+ +

causes the DLL to be loaded and also for the R variables foo, +bar_sym, R_call_sym and R_version_sym to be +defined in the package’s namespace. +

+

Note that the names for the R variables are taken from the entry in +the registration information and do not need to be the same as the name +of the native routine. This allows the creator of the registration +information to map the native symbols to non-conflicting variable names +in R, e.g. R_version to R_version_sym for use in an +R function such as +

+
+
R_version <- function()
+{
+  .Call(R_version_sym)
+}
+
+ +

Using argument .fixes allows an automatic prefix to be added to +the registered symbols, which can be useful when working with an +existing package. For example, package KernSmooth has +

+
+
useDynLib(KernSmooth, .registration = TRUE, .fixes = "F_")
+
+ +

which makes the R variables corresponding to the FORTRAN symbols +F_bkde and so on, and so avoid clashes with R code in the +namespace. +

+ + +
+ + + +

1.5.5 An example

+ +

As an example consider two packages named foo and bar. The +R code for package foo in file foo.R is +

+
+
+
+
x <- 1
+f <- function(y) c(x,y)
+foo <- function(x) .Call("foo", x, PACKAGE="foo")
+print.foo <- function(x, ...) cat("<a foo>\n")
+
+
+
+ +

Some C code defines a C function compiled into DLL foo (with an +appropriate extension). The NAMESPACE file for this package is +

+
+
+
+
useDynLib(foo)
+export(f, foo)
+S3method(print, foo)
+
+
+
+ +

The second package bar has code file bar.R +

+
+
+
+
c <- function(...) sum(...)
+g <- function(y) f(c(y, 7))
+h <- function(y) y+9
+
+
+
+ +

and NAMESPACE file +

+
+
+
+
import(foo)
+export(g, h)
+
+
+
+ +

Calling library(bar) loads bar and attaches its exports to +the search path. Package foo is also loaded but not attached to +the search path. A call to g produces +

+
+
> g(6)
+[1]  1 13
+
+ +

This is consistent with the definitions of c in the two settings: +in bar the function c is defined to be equivalent to +sum, but in foo the variable c refers to the +standard function c in base. +

+
+ +
+


+
+ +

1.5.6 Namespaces with S4 classes and methods

+ +

Some additional steps are needed for packages which make use of formal +(S4-style) classes and methods (unless these are purely used +internally). The package should have Depends: methods in its +DESCRIPTION file52 and import(methods) or +importFrom(methods, ...) plus any classes and methods which are +to be exported need to be declared in the NAMESPACE file. For +example, the stats4 package has +

+ + + +
+
export(mle) # exporting methods implicitly exports the generic
+importFrom("graphics", plot)
+importFrom("stats", optim, qchisq)
+## For these, we define methods or (AIC, BIC, nobs) an implicit generic:
+importFrom("stats", AIC, BIC, coef, confint, logLik, nobs, profile,
+	   update, vcov)
+exportClasses(mle, profile.mle, summary.mle)
+## All methods for imported generics:
+exportMethods(coef, confint, logLik, plot, profile, summary,
+	      show, update, vcov)
+## implicit generics which do not have any methods here
+export(AIC, BIC, nobs)
+
+ + + +

All S4 classes to be used outside the package need to be listed in an +exportClasses directive. Alternatively, they can be specified +using exportClassPattern53 in the same style as +for exportPattern. To export methods for generics from other +packages an exportMethods directive can be used. +

+

Note that exporting methods on a generic in the namespace will also +export the generic, and exporting a generic in the namespace will also +export its methods. If the generic function is not local to this +package, either because it was imported as a generic function or because +the non-generic version has been made generic solely to add S4 methods +to it (as for functions such as plot in the example above), it +can be declared via either or both of export or +exportMethods, but the latter is clearer (and is used in the +stats4 example above). In particular, for primitive functions +there is no generic function, so export would export the +primitive, which makes no sense. On the other hand, if the generic is +local to this package, it is more natural to export the function itself +using export(), and this must be done if an implicit +generic is created without setting any methods for it (as is the case +for AIC in stats4). +

+

A non-local generic function is only exported to ensure that calls to +the function will dispatch the methods from this package (and that is +not done or required when the methods are for primitive functions). For +this reason, you do not need to document such implicitly created generic +functions, and undoc in package tools will not report them. +

+

If a package uses S4 classes and methods exported from another package, +but does not import the entire namespace of the other +package54, it needs +to import the classes and methods explicitly, with directives +

+ + + +
+
importClassesFrom(package, ...)
+importMethodsFrom(package, ...)
+
+ +

listing the classes and functions with methods respectively. Suppose we +had two small packages A and B with B using A. +Then they could have NAMESPACE files +

+
+
+
+
export(f1, ng1)
+exportMethods("[")
+exportClasses(c1)
+
+
+
+ +

and +

+
+
+
+
importFrom(A, ng1)
+importClassesFrom(A, c1)
+importMethodsFrom(A, f1)
+export(f4, f5)
+exportMethods(f6, "[")
+exportClasses(c1, c2)
+
+
+
+ +

respectively. +

+

Note that importMethodsFrom will also import any generics defined +in the namespace on those methods. +

+

It is important if you export S4 methods that the corresponding generics +are available. You may for example need to import plot from +graphics to make visible a function to be converted into its +implicit generic. But it is better practice to make use of the generics +exported by stats4 as this enables multiple packages to +unambiguously set methods on those generics. +

+
+ + + +

1.6 Writing portable packages

+ +

This section contains advice on writing packages to be used on multiple +platforms or for distribution (for example to be submitted to a package +repository such as CRAN). +

+ + + + + + + + +

Portable packages should have simple file names: use only alphanumeric +ASCII characters and period (.), and avoid those names +not allowed under Windows which are mentioned above. +

+

Many of the graphics devices are platform-specific: even X11() +(aka x11()) which although emulated on Windows may not be +available on a Unix-alike (and is not the preferred screen device on OS +X). It is rarely necessary for package code or examples to open a new +device, but if essential,55 use dev.new(). +

+

Use R CMD build to make the release .tar.gz file. +

+

R CMD check provides a basic set of checks, but often further +problems emerge when people try to install and use packages submitted to +CRAN – many of these involve compiled code. Here are some +further checks that you can do to make your package more portable. +

+
    +
  • If your package has a configure script, provide a +configure.win script to be used on Windows (an empty file if no +actions are needed). + +
  • If your package has a Makevars or Makefile file, make sure +that you use only portable make features. Such files should be +LF-terminated56 (including the final +line of the file) and not make use of GNU extensions. (The POSIX +specification is available at +http://pubs.opengroup.org/onlinepubs/9699919799/utilities/make.html; +anything not documented there should be regarded as an extension to be +avoided.) Commonly misused GNU extensions are conditional inclusions +(ifeq and the like), ${shell ...} and ${wildcard +...}, and the use of +=57 and :=. Also, the use of $< other than in +implicit rules is a GNU extension, as is the $^ macro. +Unfortunately makefiles which use GNU extensions often run on other +platforms but do not have the intended results. + +

    The use of ${shell ...} can be avoided by using backticks, e.g. +

    +
    +
    PKG_CPPFLAGS = `gsl-config --cflags`
    +
    + +

    which works in all versions of make known58 to be used with +R. +

    +

    If you really must require GNU make, declare it in the DESCRIPTION +file by +

    +
    +
    SystemRequirements: GNU make
    +
    + +

    If you only need GNU make for parts of the package which are rarely +needed (for example to create bibliography files under +vignettes), use a file called GNUmakefile rather than +Makefile as GNU make (only) will use the former. +

    +

    Since the only viable make for Windows is GNU make, it is permissible to +use GNU extensions in files Makevars.win or Makefile.win. +

    +
  • Bash extensions also need to be avoided in shell scripts, including +expressions in Makefiles (which are passed to the shell for processing). +Some R platforms use strict59 Bourne shells: the R toolset on Windows and some +Unix-alike OSes use ash +(https://en.wikipedia.org/wiki/Almquist_shell), a rather +minimal shell with few builtins. Beware of assuming that all the POSIX +command-line utilities are available, especially on Windows where only a +minimal set is provided for use with R. +(See The command line tools in R Installation and Administration.) +One particular issue is the use of echo, for which two +behaviours are allowed +(http://pubs.opengroup.org/onlinepubs/9699919799/utilities/echo.html) +and both occur as defaults on R platforms: portable applications +should not use -n (as the first argument) nor escape sequences. + +
  • Make use of the abilities of your compilers to check the +standards-conformance of your code. For example, gcc can be +used with options -Wall -pedantic to alert you to potential +problems. This is particularly important for C++, where g++ -Wall +-pedantic will alert you to the use of GNU extensions which fail to +compile on most other C++ compilers. If R was not configured +accordingly, one can achieve this via personal Makevars +files. +See Customizing package compilation in R Installation and Administration, + +

    Although there is a 2011 version of the C++ standard, even partial +implementations are not universally available. Portable C++ code needs +to follow the 1998 standard (and not use features from C99). See also +Using C++11 code to specify a C++11 compiler. +

    +

    If you use FORTRAN 77, ftnchek +(http://www.dsm.fordham.edu/~ftnchek/) provides thorough testing +of conformance to the standard. +

    +

    Not all common R platforms conform to the expected standards, e.g. +C99 for C code. One common area of problems is the *printf +functions where Windows does not support %lld, %Lf and +similar formats (and has its own formats such as %I64d for 64-bit +integers). It is very rare to need to output such types, and 64-bit +integers can usually be converted to doubles for output. +

    +
  • Do be very careful with passing arguments between R, C and +FORTRAN code. In particular, long in C will be 32-bit +on some R platforms (including 64-bit Windows), but 64-bit on most +modern Unix and Linux platforms. It is rather unlikely that the use of +long in C code has been thought through: if you need a longer +type than int you should use a configure test for a C99 type such +as int_fast64_t (and failing that, long long 60) and typedef your own type +to be long or long long, or use another suitable type +(such as size_t). + +

    It is not safe to assume that long and pointer types are the same +size, and they are not on 64-bit Windows. If you need to convert +pointers to and from integers use the C99 integer types intptr_t +and uintptr_t (which are defined in the header <stdint.h> +and are not required to be implemented by the C99 standard). +

    +

    Note that integer in FORTRAN corresponds to int +in C on all R platforms. +

    +
  • Under no circumstances should your compiled code ever call abort or exit: these terminate the user’s R process, quite possibly including all his unsaved work. One usage that could call abort is the assert macro in C or C++ functions, which should never be active in production code. The normal way to ensure that is to define the macro NDEBUG, and R CMD INSTALL does so as part of the compilation flags. If you wish to use assert during development, you can include -UNDEBUG in PKG_CPPFLAGS. Note that your own src/Makefile or makefiles in sub-directories may also need to define NDEBUG.

    This applies not only to your own code but to any external software you +compile in or link to. +

    +
  • Compiled code should not write to stdout or stderr and C++ +and Fortran I/O should not be used. As with the previous item such +calls may come from external software and may never be called, but +package authors are often mistaken about that. + +
  • Compiled code should not call the system random number generators such +as rand, drand48 and random62, but rather use the +interfaces to R’s RNGs described in Random numbers. In +particular, if more than one package initializes the system RNG (e.g. +via srand), they will interfere with each other. + +

    Nor should the C++11 random number library be used. +

    +
  • Errors in memory allocation and reading/writing outside arrays are very +common causes of crashes (e.g., segfaults) on some machines. +See Checking memory access for tools which can be used to look for this. + +
  • Many platforms will allow unsatisfied entry points in compiled code, but +will crash the application (here R) if they are ever used. Some +(notably Windows) will not. Looking at the output of + +
    +
    nm -pg mypkg.so
    +
    + +

    and checking if any of the symbols marked U is unexpected is a +good way to avoid this. +

    +
  • Linkers have a lot of freedom in how to resolve entry points in +dynamically-loaded code, so the results may differ by platform. One +area that has caused grief is packages including copies of standard +system software such as libz (especially those already linked +into R). In the case in point, entry point gzgets was +sometimes resolved against the old version compiled into the package, +sometimes against the copy compiled into R and sometimes against the +system dynamic library. The only safe solution is to rename the entry +points in the copy in the package. We have even seen problems with +entry point name myprintf, which is a system entry +point63 on some Linux systems. + +
  • Conflicts between symbols in DLLs are handled in very platform-specific +ways. Good ways to avoid trouble are to make as many symbols as +possible static (check with nm -pg), and to use names which are +clearly tied to your package (which also helps users if anything does go +wrong). Note that symbol names starting with R_ are regarded as +part of R’s namespace and should not be used in packages. + +
  • It is not portable to call compiled code in R or other packages +via .Internal, .C, .Fortran, .Call or +.External, since such interfaces are subject to change without +notice and will probably result in your code terminating the R +process. + +
  • Do not use (hard or symbolic) file links in your package sources. +Where possible R CMD build will replace them by copies. + +
  • If you do not yourself have a Windows system, consider submitting your +source package to WinBuilder (http://win-builder.r-project.org/) +before distribution. + +
  • It is bad practice for package code to alter the search path using +library, require or attach and this often does not +work as intended. For alternatives, see Suggested packages and +with. + +
  • Examples can be run interactively via example as well as +in batch mode when checking. So they should behave appropriately in +both scenarios, conditioning by interactive() the parts which +need an operator or observer. For instance, progress +bars64 are only appropriate in +interactive use, as is displaying help pages or calling View() +(see below). + +
  • Be careful with the order of entries in macros such as PKG_LIBS. +Some linkers will re-order the entries, and behaviour can differ between +dynamic and static libraries. Generally -L options should +precede65 the libraries (typically +specified by -l options) to be found from those directories, +and libraries are searched once in the order they are specified. Not +all linkers allow a space after -L . + +
  • The ar utility is often used in makefiles to make static +libraries. Its modifier u is defined by POSIX but is disabled in +GNU ar on some recent Linux distributions which use +‘deterministic mode’. The safe way to make a static library is to first +remove any existing file of that name then use ar -cr and then +ranlib if needed (which is system-dependent: on most +systems66 ar always +maintains a symbol table). + +
  • Some people have a need to set a locale. Locale names are not portable, and e.g. ‘fr_FR.utf8’ is commonly used on Linux but not accepted on either Solaris or OS X. ‘fr_FR.UTF-8’ is more portable, being accepted on recent Linux, AIX, FreeBSD, OS X and Solaris (at least). However, some Linux distributions micro-package, so locales defined by glibc (including these examples) may not be installed. (A minimal R sketch of selecting a locale defensively appears after this list.)
+ +
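As referred to in the locale item above, the following is a minimal sketch (not from this manual) of selecting a locale defensively; the locale name is only an example and may not be installed on a given system:

    old <- Sys.getlocale("LC_CTYPE")
    loc <- Sys.setlocale("LC_CTYPE", "fr_FR.UTF-8")  # returns "" (with a warning) if unavailable
    if (!nzchar(loc)) message("requested locale not available; keeping ", old)
    ## ... code that relies on the locale ...
    Sys.setlocale("LC_CTYPE", old)                   # restore the user's setting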

Do be careful in what your tests (and examples) actually test. Bad practices seen in distributed packages include:

+
    +
  • It is not reasonable to test the time taken by a command: you cannot +know how fast or how heavily loaded an R platform might be. At best +you can test a ratio of times, and even that is fraught with +difficulties. + +
  • Do not test the exact format of R error messages: They change, and +they can be translated. + +

    Worse, packages have tested the exact format of system error messages, +which are platform-dependent and perhaps locale-dependent. +

    +
  • If you use functions such as View, remember that in testing there +is no one to look at the output. It is better to use something like one of +
    +
    if(interactive()) View(obj) else print(head(obj))
    +if(interactive()) View(obj) else str(obj)
    +
    + +
  • Only test the accuracy of results if you have done a formal error +analysis. Things such as checking that probabilities numerically sum to +one are silly: numerical tests should always have a tolerance. That the +tests on your platform achieve a particular tolerance says little about +other platforms. R is configured by default to make use of long +doubles where available, but they may not be available or be too slow +for routine use. Most R platforms use ‘ix86’ or +‘x86_64’ CPUs: these use extended precision registers on some +but not all of their FPU instructions. Thus the achieved precision can +depend on the compiler version and optimization flags—our experience +is that 32-bit builds tend to be less precise than 64-bit ones. But not +all platforms use those CPUs, and not all67 which use them configure them to +allow the use of extended precision. In particular, ARM CPUs do not +(currently) have extended precision nor long doubles, and long double +was 64-bit on HP/PA Linux. + +

    If you must try to establish a tolerance empirically, configure and +build R with --disable-long-double and use appropriate +compiler flags (such as -ffloat-store and +-fexcess-precision=standard for gcc, depending on the +CPU type68) to +mitigate the effects of extended-precision calculations. +

    +

    Tests which involve random inputs or non-deterministic algorithms should normally set a seed or be tested for many seeds; a minimal sketch combining a fixed seed with an explicit tolerance follows this list.

    +
+ +
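As referred to above, here is a minimal sketch (not from this manual) of a numerical test which fixes a seed and uses an explicit tolerance rather than testing for exact equality:

    set.seed(42)                       # fixed seed so the test is reproducible
    p <- prop.table(runif(10))         # probabilities which should sum to 1
    stopifnot(isTRUE(all.equal(sum(p), 1, tolerance = 1e-12)))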
+ + + +

1.6.1 PDF size

+ +

There are several tools available to reduce the size of PDF files: often the size can be reduced substantially with no or minimal loss in quality. Not only do large files take up space: they can stress the PDF viewer and take many minutes to print (if they can be printed at all).

+

qpdf (http://qpdf.sourceforge.net/) can compress +losslessly. It is fairly readily available (e.g. it has binaries for +Windows and packages in Debian/Ubuntu/Fedora, and is installed as part +of the CRAN OS X distribution of R). R CMD build +has an option to run qpdf over PDF files under inst/doc +and replace them if at least 10Kb and 10% is saved. The full path to +the qpdf command can be supplied as environment variable +R_QPDF (and is on the CRAN binary of R for OS X). It seems +MiKTeX does not use PDF object compression and so qpdf can +reduce considerably the files it outputs: MiKTeX can be overridden by +code in the preamble of an Sweave or LaTeX file — see how this is +done for the R reference manual at +https://svn.r-project.org/R/trunk/doc/manual/refman.top. +

+

Other tools can reduce the size of PDFs containing bitmap images at +excessively high resolution. These are often best re-generated (for +example Sweave defaults to 300 ppi, and 100–150 is more +appropriate for a package manual). These tools include Adobe Acrobat +(not Reader), Apple’s Preview69 and Ghostscript (which +converts PDF to PDF by +

+
+
ps2pdf options -dAutoRotatePages=/None in.pdf out.pdf
+
+ +

and suitable options might be +

+
+
-dPDFSETTINGS=/ebook
+-dPDFSETTINGS=/screen
+
+ +

; see http://www.ghostscript.com/doc/current/Ps2pdf.htm for +more such and consider all the options for image downsampling). There +have been examples in CRAN packages for which Ghostscript 9.06 +and later produced much better reductions than 9.05 or earlier. +

+

We occasionally come across large PDF files containing excessively complicated figures using PDF vector graphics: such figures are often best redesigned or, failing that, output as PNG files.

+

Option --compact-vignettes to R CMD build defaults to +value ‘qpdf’: use ‘both’ to try harder to reduce the size, +provided you have Ghostscript available (see the help for +tools::compactPDF). +
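The compaction can also be run by hand from R. This is a minimal sketch which assumes qpdf (and, for the ‘ebook’ quality, Ghostscript) is on the path and that there are PDF files under inst/doc:

    library(tools)
    compactPDF("inst/doc", gs_quality = "ebook")   # reports files whose size was reduced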

+
+ + + +

1.6.2 Check timing

+ +

There are several ways to find out where time is being spent in the +check process. Start by setting the environment variable +_R_CHECK_TIMINGS_ to ‘0’. This will report the total CPU +times (not Windows) and elapsed times for installation and running +examples, tests and vignettes, under each sub-architecture if +appropriate. For tests and vignettes, it reports the time for each as +well as the total. +

+

Setting _R_CHECK_TIMINGS_ to a positive value sets a threshold (in +seconds elapsed time) for reporting timings. +

+

If you need to look in more detail at the timings for examples, use +option --timings to R CMD check (this is set by +--as-cran). This adds a summary to the check output for all +the examples with CPU or elapsed time of more than 5 seconds. It +produces a file mypkg.Rcheck/mypkg-Ex.timings +containing timings for each help file: it is a tab-delimited file which +can be read into R for further analysis. +
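The timings file can then be examined from R. In this minimal sketch the package name ‘mypkg’ and the column name ‘elapsed’ are assumptions about the check output rather than something defined in this manual:

    tm <- read.delim("mypkg.Rcheck/mypkg-Ex.timings")
    head(tm[order(tm$elapsed, decreasing = TRUE), ])   # slowest examples first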

+

Timings for the tests and vignette runs are given at the bottom of the +corresponding log file: note that log files for successful vignette runs +are only retained if environment variable +_R_CHECK_ALWAYS_LOG_VIGNETTE_OUTPUT_ is set to a true value. +

+ +
+ + + +

1.6.3 Encoding issues

+ +

Care is needed if your package contains non-ASCII text, and in +particular if it is intended to be used in more than one locale. It is +possible to mark the encoding used in the DESCRIPTION file and in +.Rd files, as discussed elsewhere in this manual. +

+

First, consider carefully if you really need non-ASCII text. Many users of R will only be able to correctly view text in their native language group (e.g. Western European, Eastern European, Simplified Chinese) and ASCII. Other characters may not be rendered at all, rendered incorrectly, or cause your R code to give an error. For .Rd documentation, marking the encoding and including ASCII transliterations is likely to do a reasonable job. The set of characters which is commonly supported is wider than it used to be around 2000, but non-Latin alphabets (Greek, Russian, Georgian, …) are still often problematic and those with double-width characters (Chinese, Japanese, Korean) often need specialist fonts to render correctly.

+

Several CRAN packages have messages in their R code in French (and a +few in German). A better way to tackle this is to use the +internationalization facilities discussed elsewhere in this manual. +

+

Function showNonASCIIfile in package tools can help in +finding non-ASCII bytes in files. +

+

There is a portable way to have arbitrary text in character strings +(only) in your R code, which is to supply them in Unicode as +\uxxxx escapes. If there are any characters not in the current +encoding the parser will encode the character string as UTF-8 and mark +it as such. This applies also to character strings in datasets: they +can be prepared using \uxxxx escapes or encoded in UTF-8 in a +UTF-8 locale, or even converted to UTF-8 via ‘iconv()’. If you do +this, make sure you have ‘R (>= 2.10)’ (or later) in the +‘Depends’ field of the DESCRIPTION file. +
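A minimal sketch of the portable approach (the example string and file name are invented):

    x <- "Gr\u00fc\u00dfe"    # non-ASCII characters entered portably via \uxxxx escapes
    Encoding(x)               # strings containing \uxxxx escapes are marked as "UTF-8"
    tools::showNonASCIIfile("R/messages.R")   # lists any non-ASCII bytes in a source file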

+

R sessions running in non-UTF-8 locales will if possible re-encode +such strings for display (and this is done by RGui on Windows, +for example). Suitable fonts will need to be selected or made +available71 both for the console/terminal and graphics devices such as +‘X11()’ and ‘windows()’. Using ‘postscript’ or +‘pdf’ will choose a default 8-bit encoding depending on the +language of the UTF-8 locale, and your users would need to be told how +to select the ‘encoding’ argument. +

+

If you want to run R CMD check on a Unix-alike over a package +that sets a package encoding in its DESCRIPTION file and do +not use a UTF-8 locale you may need to specify a suitable locale +via environment variable R_ENCODING_LOCALES. The default +is equivalent to the value +

+
+
"latin1=en_US:latin2=pl_PL:UTF-8=en_US.UTF-8:latin9=fr_FR.iso885915@euro"
+
+ +

(which is appropriate for a system based on glibc: OS X requires +latin9=fr_FR.ISO8859-15) except that if the current locale is +UTF-8 then the package code is translated to UTF-8 for syntax checking, +so it is strongly recommended to check in a UTF-8 locale. +

+
+ + + +

1.6.4 Portable C and C++ code

+ +

Writing portable C and C++ code is mainly a matter of observing the +standards (C99, C++98 or where declared C++11) and testing that +extensions (such as POSIX functions) are supported. However, some +common errors are worth pointing out here. It can be helpful to look up +functions at http://www.cplusplus.com/reference/ or +http://en.cppreference.com/w/ and compare what is defined in the +various standards. +

+
    +
  • Mathematical functions such as sqrt are defined in C++ for +floating-point arguments. It is legitimate in C++ to overload these +with versions for types float, double, long double +and possibly more. This means that calling sqrt on an integer +type may have ‘overloading ambiguity’ as it could be promoted to any of +the supported floating-point types: this is commonly seen on Solaris, +but for pow also seen on OS X. (C++11 requires additional +overloads for integer types.) + +

    A not-uncommonly-seen problem is to mistakenly call floor(x/y) or +ceil(x/y) for int arguments x and y. Since +x/y does integer division, the result is an int and +‘overloading ambiguity’ may be reported. +

    +
  • Function fabs is defined only for floating-point types, except in +C++11 which has overloads for std::fabs in <cmath> for +integer types. Function abs is defined in C99’s +<stdlib.h> for int and in C++98’s <cstdlib> for +integer types, overloaded in <cmath> for floating-point types. +C++11 has additional overloads for std::abs in <cmath> for +integer types. The effect of calling abs for a floating-point +type is implementation-specific: it may truncate to an integer. + +
  • Functions/macros such as isnan, isinf and isfinite +are not required by C++98: where compilers support them they may be only +in the std namespace or only in the main namespace. There is no +way to make use of these functions which works with all C++ compilers +currently in use on R platforms: use R’s versions such as +ISNAN and R_FINITE instead. + +

    It is an error (and makes little sense, although it has been seen) to call these functions for integer arguments.

    +
  • The GNU compilers have a large number of non-portable extensions. For +example, INFINITY (which is in C99 but not C++98), for which R +provides the portable R_PosInf (and R_NegInf for +-INFINITY). And NAN is just one NaN value: in R code +NA_REAL is usually what is intended, but R_NaN is also +available. + +

    Some (but not all) extensions are listed at +https://gcc.gnu.org/onlinedocs/gcc/C-Extensions.html and +https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Extensions.html. +

    +
  • Including C headers in C++ code is not portable. Including the C +header math.h in C++ code often causes conflicts with +cmath which may be included by other headers. This is +particularly problematic with C++11 compilers, as functions like +sqrt and isnan are defined for double arguments in +math.h and for a range of types including double in +cmath. + +
  • Variable-length arrays are C99, not supported by C++98 nor by the C++ +compilers in use with R on some platforms. + +
  • Be careful to include the headers which define the functions you use. +Some compilers/OSes include other system headers in their headers which +are not required by the standards, and so code may compile on such +systems and not on others. (A prominent example is the C++11 header +<random> which is indirectly included by <algorithm> by +g++. Another frequent issue is the C header <time.h> +which is included by other headers on Linux and Windows but not OS X nor +Solaris.) + +
  • For C++ code, be careful to specify namespaces where needed. Many +functions are defined by the standards to be in the std +namespace, but g++ puts many such also in the C++ main +namespace. One way to do so is to use declarations such as +
    +
    using std::floor;
    +
    + +
  • Macros defined by the compiler/OS can cause problems. Identifiers +starting with an underscore followed by an upper-case letter or another +underscore are reserved for system macros and should not be used in +portable code (including not as guards in C/C++ headers). Other macros, +typically upper-case, may be defined by the compiler or system headers +and can cause problems. +The most common issue involves the names of the Intel CPU registers such +as CS, DS, ES, FS, GS and SS +(and more with longer abbreviations) defined on i586/x64 Solaris in +<sys/regset.h> and often included indirectly by <stdlib.h> +and other core headers. + +
  • typedefs in OS headers can conflict with those in the package: an +example is index_t defined in <sys/types.h> on Solaris. +
+ +

Some additional information for C++ is available at +http://journal.r-project.org/archive/2011-2/RJournal_2011-2_Plummer.pdf +by Martyn Plummer. +

+
+ + + +

1.6.5 Binary distribution

+ +

If you want to distribute a binary version of a package on Windows or OS X, there are further checks you need to run to ensure it is portable: it is all too easy to depend on external software on your own machine that other users will not have.

+

For Windows, check what other DLLs your package’s DLL depends on +(‘imports’ from in the DLL tools’ parlance). A convenient GUI-based +tool to do so is ‘Dependency Walker’ +(http://www.dependencywalker.com/) for both 32-bit and 64-bit +DLLs – note that this will report as missing links to R’s own DLLs +such as R.dll and Rblas.dll. For 32-bit DLLs only, the +command-line tool pedump.exe -i (in Rtools*.exe) can be +used, and for the brave, the objdump tool in the appropriate +toolchain will also reveal what DLLs are imported from. If you use a +toolchain other than one provided by the R developers or use your own +makefiles, watch out in particular for dependencies on the toolchain’s +runtime DLLs such as libgfortran, libstdc++ and +libgcc_s. +

+

For OS X, using R CMD otool -L on the package’s shared object(s) +in the libs directory will show what they depend on: watch for +any dependencies in /usr/local/lib, notably +libgfortran.2.dylib, libgfortran.3.dylib or +libquadmath.0.dylib. +

+

Many people (including the CRAN package repository) will not +accept source packages containing binary files as the latter are a +security risk. If you want to distribute a source package which needs +external software on Windows or OS X, options include +

    +
  • To arrange for installation of the package to download the +additional software from a URL, as e.g. package Cairo does. + +
  • (For CRAN.) +To negotiate with Uwe Ligges to host the additional components on +WinBuilder, and write a configure.win file to install them. +There used to be many examples, e.g. package rgdal (however +nowadays CRAN prefers to use a uniform cross-compilation approach for +software such as GDAL). + +
+ +

Be aware that license requirements will need to be met so you may need +to supply the sources for the additional components (and will if your +package has a GPL-like license). +

+ +
+ + + +

1.7 Diagnostic messages

+ +

Diagnostic messages can be made available for translation, so it is +important to write them in a consistent style. Using the tools +described in the next section to extract all the messages can give a +useful overview of your consistency (or lack of it). +Some guidelines follow. +

+
    +
  • Messages are sentence fragments, and not viewed in isolation. So it is +conventional not to capitalize the first word and not to end with a +period (or other punctuation). + +
  • Try not to split up messages into small pieces. In C error messages use +a single format string containing all English words in the messages. + +

    In R error messages do not construct a message with paste (such messages will not be translated); instead pass multiple arguments to stop or warning, or use gettextf. (A minimal sketch contrasting the two approaches follows this list.)

    +
  • Do not use colloquialisms such as “can’t” and “don’t”. + +
  • Conventionally single quotation marks are used for quotations such as + +
    +
    'ord' must be a positive integer, at most the number of knots
    +
    + +

    and double quotation marks when referring to an R character string or +a class, such as +

    +
    +
    'format' must be "normal" or "short" - using "normal"
    +
    + +

    Since ASCII does not contain directional quotation marks, it +is best to use ‘'’ and let the translator (including automatic +translation) use directional quotations where available. The range of +quotation styles is immense: unfortunately we cannot reproduce them in a +portable texinfo document. But as a taster, some languages use +‘up’ and ‘down’ (comma) quotes rather than left or right quotes, and +some use guillemets (and some use what Adobe calls ‘guillemotleft’ to +start and others use it to end). +

    +

    In R messages it is also possible to use sQuote or dQuote as in +

    +
    +
    	stop(gettextf("object must be of class %s or %s",
    +		      dQuote("manova"), dQuote("maov")),
    +	     domain = NA)
    +
    + +
  • Occasionally messages need to be singular or plural (and in other languages there may be no such concept or several plural forms – Slovenian has four). So avoid constructions such as the one once used in library
    +
    if((length(nopkgs) > 0) && !missing(lib.loc)) {
    +    if(length(nopkgs) > 1)
    +	warning("libraries ",
    +		paste(sQuote(nopkgs), collapse = ", "),
    +		" contain no packages")
    +    else
    +	warning("library ", paste(sQuote(nopkgs)),
    +		" contains no package")
    +}
    +
    + +

    and was replaced by +

    +
    +
    if((length(nopkgs) > 0) && !missing(lib.loc)) {
    +    pkglist <- paste(sQuote(nopkgs), collapse = ", ")
    +    msg <- sprintf(ngettext(length(nopkgs),
    +			    "library %s contains no packages",
    +			    "libraries %s contain no packages",
    +			    domain = "R-base"),
    +		   pkglist)
    +    warning(msg, domain=NA)
    +}
    +
    + +

    Note that it is much better to have complete clauses as here, since +in another language one might need to say +‘There is no package in library %s’ or +‘There are no packages in libraries %s’. +

    +
+ +
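As a minimal sketch of the point about paste made above (the function and argument names are invented):

    ## not translatable: the message is assembled with paste()
    ## stop(paste("'alpha' must be numeric, not", class(alpha)[1L]))
    ## translatable: a single format string passed through gettextf()
    f <- function(alpha) {
        if (!is.numeric(alpha))
            stop(gettextf("'alpha' must be numeric, not %s",
                          dQuote(class(alpha)[1L])), domain = NA)
        alpha
    }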
+ + + +

1.8 Internationalization

+ +

There are mechanisms to translate the R- and C-level error and warning messages. These are only available if R is compiled with NLS support (which is requested by configure option --enable-nls, the default).

+

The procedures make use of msgfmt and xgettext which are +part of GNU gettext and this will need to be installed: +Windows users can find pre-compiled binaries at +https://www.stats.ox.ac.uk/pub/Rtools/goodies/gettext-tools.zip. +

+ + + + + + +
+ + + +

1.8.1 C-level messages

+ +

The process of enabling translations is +

+
    +
  • In a header file that will be included in all the C (or C++ or Objective +C/C++) files containing messages that should be translated, declare + +
    +
    #include <R.h>  /* to include Rconfig.h */
    +
    +#ifdef ENABLE_NLS
    +#include <libintl.h>
    +#define _(String) dgettext ("pkg", String)
    +/* replace pkg as appropriate */
    +#else
    +#define _(String) (String)
    +#endif
    +
    + +
  • For each message that should be translated, wrap it in _(...), +for example + +
    +
    error(_("'ord' must be a positive integer"));
    +
    + +

    If you want to use different messages for singular and plural forms, you +need to add +

    +
    +
    #ifndef ENABLE_NLS
    +#define dngettext(pkg, String, StringP, N) (N > 1 ? StringP : String)
    +#endif
    +
    + +

    and mark strings by +

    +
    +
    dngettext("pkg", <singular string>, <plural string>, n)
    +
    + +
  • In the package’s src directory run + +
    +
    xgettext --keyword=_ -o pkg.pot *.c
    +
    + +
+ +

The file src/pkg.pot is the template file, and +conventionally this is shipped as po/pkg.pot. +

+
+ + + +

1.8.2 R messages

+ +

Mechanisms are also available to support the automatic translation of +R stop, warning and message messages. They make +use of message catalogs in the same way as C-level messages, but using +domain R-pkg rather than pkg. Translation of +character strings inside stop, warning and message +calls is automatically enabled, as well as other messages enclosed in +calls to gettext or gettextf. (To suppress this, use +argument domain=NA.) +

+

Tools to prepare the R-pkg.pot file are provided in package +tools: xgettext2pot will prepare a file from all strings +occurring inside gettext/gettextf, stop, +warning and message calls. Some of these are likely to be +spurious and so the file is likely to need manual editing. +xgettext extracts the actual calls and so is more useful when +tidying up error messages. +
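A minimal sketch of creating the template (‘pkgdir’ is a placeholder for the path to the package sources, ‘pkg’ for the package name, and a pkgdir/po directory is assumed to exist):

    tools::xgettext2pot(dir = "pkgdir",
                        potFile = file.path("pkgdir", "po", "R-pkg.pot"))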

+

The R function ngettext provides an interface to the C +function of the same name: see example in the previous section. It is +safest to use domain="R-pkg" explicitly in calls to +ngettext, and necessary for earlier versions of R unless they +are calls directly from a function in the package. +

+ +
+ +
+


+
+ +

1.8.3 Preparing translations

+ +

Once the template files have been created, translations can be made. +Conventional translations have file extension .po and are placed +in the po subdirectory of the package with a name that is either +‘ll.po’ or ‘R-ll.po’ for translations of the C and R +messages respectively to language with code ‘ll’. +

+

See Localization of messages in R Installation and Administration, for details of language codes. +

+

There is an R function, update_pkg_po in package tools, +to automate much of the maintenance of message translations. See its +help for what it does in detail. +
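In the simplest case a single call on the package source directory is enough to get started (a minimal sketch; ‘pkgdir’ is a placeholder path):

    tools::update_pkg_po("pkgdir")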

+

If this is called on a package with no existing translations, it creates +the directory pkgdir/po, creates a template file of R +messages, pkgdir/po/R-pkg.pot, within it, creates the +‘en@quot’ translation and installs that. (The ‘en@quot’ +pseudo-language interprets quotes in their directional forms in suitable +(e.g. UTF-8) locales.) +

+

If the package has C source files in its src directory +that are marked for translation, use +

+
+
touch pkgdir/po/pkg.pot
+
+ +

to create a dummy template file, then call update_pkg_po again +(this can also be done before it is called for the first time). +

+

When translations to new languages are added in the pkgdir/po +directory, running the same command will check and then +install the translations. +

+

If the package sources are updated, the same command will update the template files, merge the changes into the translation .po files and then install the updated translations. You will often see that merging marks translations as ‘fuzzy’ and this is reported in the coverage statistics. As fuzzy translations are not used, this is an indication that the translation files need human attention.

+

The merged translations are run through tools::checkPofile to +check that C-style formats are used correctly: if not the mismatches are +reported and the broken translations are not installed. +

+

This function needs the GNU gettext-tools installed and on the +path: see its help page. +

+ + + +
+ + + +

1.9 CITATION files

+ +

An installed file named CITATION will be used by the +citation() function. (It should be in the inst +subdirectory of the package sources.) +

+

The CITATION file is parsed as R code (in the package’s +declared encoding, or in ASCII if none is declared). If no +such file is present, citation auto-generates citation +information from the package DESCRIPTION metadata, and an example +of what that would look like as a CITATION file can be seen in +recommended package nlme (see below): recommended packages +boot, cluster and mgcv have further +examples. +

+

A CITATION file will contain calls to function bibentry. +

+

Here is that for nlme: +

+
+
year <- sub("-.*", "", meta$Date)
+note <- sprintf("R package version %s", meta$Version)
+
+bibentry(bibtype = "Manual",
+         title = "{nlme}: Linear and Nonlinear Mixed Effects Models",
+         author = c(person("Jose", "Pinheiro"),
+                    person("Douglas", "Bates"),
+                    person("Saikat", "DebRoy"),
+                    person("Deepayan", "Sarkar"),
+                    person("R Core Team")),
+         year = year,
+         note = note,
+         url = "https://CRAN.R-project.org/package=nlme")
+
+ +

Note the way that information that may need to be updated is picked up +from object meta, a parsed version of the DESCRIPTION +file72 – it is +tempting to hardcode such information, but it normally then gets +outdated. See ?bibentry for further details of the information +which can be provided. +

+

In case a bibentry contains LaTeX markup (e.g., for accented +characters or mathematical symbols), it may be necessary to provide a +text representation to be used for printing via the textVersion +argument to bibentry. E.g., earlier versions of +nlme additionally used +

+
+
         textVersion =
+         paste0("Jose Pinheiro, Douglas Bates, Saikat DebRoy,",
+                "Deepayan Sarkar and the R Core Team (",
+                year,
+                "). nlme: Linear and Nonlinear Mixed Effects Models. ",
+                note, ".")
+
+ +

The CITATION file should itself produce no output when +source-d. +

+

It is desirable (and essential for CRAN) that the +CITATION file does not contain calls to functions such as +packageDescription which assume the package is installed in a +library tree on the package search path. +
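To check the result, the installed file can be inspected with citation(); a minimal sketch using the nlme example above (the package must be installed):

    cit <- citation("nlme")   # parses the installed CITATION file
    print(cit)                # the formatted citation users will see
    toBibtex(cit)             # BibTeX form, handy for spotting problems in the entries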

+
+ +
+


+
+ +

1.10 Package types

+ +

The DESCRIPTION file has an optional field Type which, if missing, is assumed to be ‘Package’, the sort of extension discussed so far in this chapter. Currently one other type is recognized; there also used to be a ‘Translation’ type.

+ + + + +
+ +
+


+
+ +

1.10.1 Frontend

+ +

This is a rather general mechanism, designed for adding new front-ends such as the former gnomeGUI package (see the Archive area on CRAN). If a configure file is found in the top-level directory of the package it is executed, and then, if a Makefile is found (often generated by configure), make is called. If R CMD INSTALL --clean is used, make clean is called. No other action is taken.

+

R CMD build can package up this type of extension, but R +CMD check will check the type and skip it. +

+

Many packages of this type need write permission for the R +installation directory. +

+
+ +
+


+
+ +

1.11 Services

+ +

Several members of the R project have set up services to assist those +writing R packages, particularly those intended for public +distribution. +

+

win-builder.r-project.org +offers the automated preparation of (32/64-bit) Windows binaries from +well-tested source packages. +

+

R-Forge (R-Forge.r-project.org) and +RForge (www.rforge.net) are similar +services with similar names. Both provide source-code management +through SVN, daily building and checking, mailing lists and a repository +that can be accessed via install.packages (they can be +selected by setRepositories and the GUI menus that use it). +Package developers have the opportunity to present their work on the +basis of project websites or news announcements. Mailing lists, forums +or wikis provide useRs with convenient instruments for discussions and +for exchanging information between developers and/or interested useRs. +

+ +
+ + + +

2 Writing R documentation files

+ + + + + + + + + + + + + + + + + + + + + +
+ + + +

2.1 Rd format

+ +

R objects are documented in files written in “R documentation” +(Rd) format, a simple markup language much of which closely resembles +(La)TeX, which can be processed into a variety of formats, +including LaTeX, HTML and plain text. The translation is +carried out by functions in the tools package called by the +script Rdconv in R_HOME/bin and by the +installation scripts for packages. +

+

The R distribution contains more than 1300 such files which can be +found in the src/library/pkg/man directories of the R +source tree, where pkg stands for one of the standard packages +which are included in the R distribution. +

+

As an example, let us look at a simplified version of +src/library/base/man/load.Rd which documents the R function +load. +

+
+
+
+
% File src/library/base/man/load.Rd
+\name{load}
+\alias{load}
+\title{Reload Saved Datasets}
+\description{
+  Reload the datasets written to a file with the function
+  \code{save}.
+}
+\usage{
+load(file, envir = parent.frame())
+}
+\arguments{
+  \item{file}{a connection or a character string giving the
+    name of the file to load.}
+  \item{envir}{the environment where the data should be
+    loaded.}
+}
+\seealso{
+  \code{\link{save}}.
+}
+\examples{
+## save all data
+save(list = ls(), file= "all.RData")
+
+## restore the saved values to the current environment
+load("all.RData")
+
+## restore the saved values to the workspace
+load("all.RData", .GlobalEnv)
+}
+\keyword{file}
+
+
+
+ +

An Rd file consists of three parts. The header gives basic +information about the name of the file, the topics documented, a title, +a short textual description and R usage information for the objects +documented. The body gives further information (for example, on the +function’s arguments and return value, as in the above example). +Finally, there is an optional footer with keyword information. The +header is mandatory. +

+

Information is given within a series of sections with standard names (and user-defined sections are also allowed). Unless otherwise specified, these should occur only once in an Rd file (in any order), and the processing software will retain only the first occurrence of a standard section in the file, with a warning.

+

See “Guidelines for Rd +files” for guidelines for writing documentation in Rd format +which should be useful for package writers. + +The R +generic function prompt is used to construct a bare-bones Rd +file ready for manual editing. Methods are defined for documenting +functions (which fill in the proper function and argument names) and +data frames. There are also functions promptData, +promptPackage, promptClass, and promptMethods for +other types of Rd file. +
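For example, a skeleton for a (hypothetical) function can be generated and then edited by hand:

    myfun <- function(x, na.rm = FALSE) mean(x, na.rm = na.rm)
    prompt(myfun)   # writes 'myfun.Rd' in the working directory, to be completed
                    # by hand and moved into the package's man directory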

+

The general syntax of Rd files is summarized below. For a detailed +technical discussion of current Rd syntax, see +“Parsing Rd files”. +

+

Rd files consist of four types of text input. The most common +is LaTeX-like, with the backslash used as a prefix on markup +(e.g. \alias), and braces used to indicate arguments +(e.g. {load}). The least common type of text is ‘verbatim’ +text, where no markup other than the comment marker (%) is +processed. There is also a rare variant of ‘verbatim’ text +(used in \eqn, \deqn, \figure, +and \newcommand) where comment markers need not be escaped. +The final type is R-like, intended for R code, but allowing some +embedded macros. Quoted strings within R-like text are handled +specially: regular character escapes such as \n may be entered +as-is. Only markup starting with \l (e.g. \link) or +\v (e.g. \var) will be recognized within quoted strings. +The rarely used vertical tab \v must be entered as \\v. +

+

Each macro defines the input type for its argument. For example, the +file initially uses LaTeX-like syntax, and this is also used in the +\description section, but the \usage section uses +R-like syntax, and the \alias macro uses ‘verbatim’ syntax. +Comments run from a percent symbol % to the end of the line in +all types of text except the rare ‘verbatim’ variant +(as on the first line of the load example). +

+

Because backslashes, braces and percent symbols have special meaning, to +enter them into text sometimes requires escapes using a backslash. In +general balanced braces do not need to be escaped, but percent symbols +always do, except in the ‘verbatim’ variant. +For the complete list of macros and rules for escapes, see +“Parsing Rd files”. +

+ + + + + + + +
+ +
+


+
+ +

2.1.1 Documenting functions

+ +

The basic markup commands used for documenting R objects (in +particular, functions) are given in this subsection. +

+
+
\name{name}
+
+

name typically74 is the basename of +the Rd file containing the documentation. It is the “name” of +the Rd object represented by the file and has to be unique in a +package. To avoid problems with indexing the package manual, it may not +contain ‘!’ ‘|’ nor ‘@’, and to avoid possible problems +with the HTML help system it should not contain ‘/’ nor a space. +(LaTeX special characters are allowed, but may not be collated +correctly in the index.) There can only be one \name entry in a +file, and it must not contain any markup. Entries in the package manual +will be in alphabetic75 order +of the \name entries. +

+
+
\alias{topic}
+
+

The \alias sections specify all “topics” the file documents. +This information is collected into index data bases for lookup by the +on-line (plain text and HTML) help systems. The topic can +contain spaces, but (for historical reasons) leading and trailing spaces +will be stripped. Percent and left brace need to be escaped by +a backslash. +

+

There may be several \alias entries. Quite often it is +convenient to document several R objects in one file. For example, +file Normal.Rd documents the density, distribution function, +quantile function and generation of random variates for the normal +distribution, and hence starts with +

+
+
\name{Normal}
+\alias{Normal}
+\alias{dnorm}
+\alias{pnorm}
+\alias{qnorm}
+\alias{rnorm}
+
+ +

Also, it is often convenient to have several different ways to refer to +an R object, and an \alias does not need to be the name of an +object. +

+

Note that the \name is not necessarily a topic documented, and if +so desired it needs to have an explicit \alias entry (as in this +example). +

+
+
\title{Title}
+
+

Title information for the Rd file. This should be capitalized +and not end in a period; try to limit its length to at most 65 +characters for widest compatibility. +

+

Markup is supported in the text, but use of characters other than +English text and punctuation (e.g., ‘<’) may limit portability. +

+

There must be one (and only one) \title section in a help file. +

+
+
\description{…}
+
+

A short description of what the function(s) do(es) (one paragraph, a few +lines only). (If a description is too long and cannot easily be +shortened, the file probably tries to document too much at once.) +This is mandatory except for package-overview files. +

+
+
\usage{fun(arg1, arg2, …)}
+
+

One or more lines showing the synopsis of the function(s) and variables +documented in the file. These are set in typewriter font. This is an +R-like command. +

+

The usage information specified should match the function definition +exactly (such that automatic checking for consistency between +code and documentation is possible). +

+

It is no longer advisable to use \synopsis for the actual synopsis and show modified synopses in the \usage. Support for \synopsis will be removed in R 3.1.0. To indicate that a function can be used in several different ways, depending on the named arguments specified, use section \details. E.g., abline.Rd contains

+
+
\details{
+  Typical usages are
+\preformatted{abline(a, b, untf = FALSE, \dots)
+......
+}
+
+ + +

Use \method{generic}{class} to indicate the name +of an S3 method for the generic function generic for objects +inheriting from class "class". In the printed versions, +this will come out as generic (reflecting the understanding that +methods should not be invoked directly but via method dispatch), but +codoc() and other QC tools always have access to the full name. +

+

For example, print.ts.Rd contains +

+
+
\usage{
+\method{print}{ts}(x, calendar, \dots)
+}
+
+ +

which will print as +

+
+
Usage:
+
+     ## S3 method for class ‘ts’:
+     print(x, calendar, ...)
+
+ +

Usage for replacement functions should be given in the style of +dim(x) <- value rather than explicitly indicating the name of the +replacement function ("dim<-" in the above). Similarly, one +can use \method{generic}{class}(arglist) <- +value to indicate the usage of an S3 replacement method for the generic +replacement function "generic<-" for objects inheriting +from class "class". +

+

Usage for S3 methods for extracting or replacing parts of an object, S3 +methods for members of the Ops group, and S3 methods for user-defined +(binary) infix operators (‘%xxx%’) follows the above rules, +using the appropriate function names. E.g., Extract.factor.Rd +contains +

+
+
\usage{
+\method{[}{factor}(x, \dots, drop = FALSE)
+\method{[[}{factor}(x, \dots)
+\method{[}{factor}(x, \dots) <- value
+}
+
+ +

which will print as +

+
+
Usage:
+
+     ## S3 method for class ‘factor’:
+     x[..., drop = FALSE]
+     ## S3 method for class ‘factor’:
+     x[[...]]
+     ## S3 replacement method for class ‘factor’:
+     x[...] <- value
+
+ + +

\S3method is accepted as an alternative to \method. +

+
+
\arguments{…}
+
+

Description of the function’s arguments, using an entry of the form +

+
+
\item{arg_i}{Description of arg_i.}
+
+ +

for each element of the argument list. (Note that there is +no whitespace between the three parts of the entry.) There may be +optional text outside the \item entries, for example to give +general information about groups of parameters. +

+ +
+
\details{…}
+
+

A detailed if possible precise description of the functionality +provided, extending the basic information in the \description +slot. +

+
+
\value{…}
+
+

Description of the function’s return value. +

+

If a list with multiple values is returned, you can use entries of the +form +

+
+
\item{comp_i}{Description of comp_i.}
+
+ +

for each component of the list returned. Optional text may +precede76 this +list (see for example the help for rle). Note that \value +is implicitly a \describe environment, so that environment should +not be used for listing components, just individual \item{}{} +entries. +

+
+
\references{…}
+
+

A section with references to the literature. Use \url{} or +\href{}{} for web pointers. +

+
+
\note{...}
+
+

Use this for a special note you want to have pointed out. Multiple +\note sections are allowed, but might be confusing to the end users. +

+

For example, pie.Rd contains +

+
+
\note{
+  Pie charts are a very bad way of displaying information.
+  The eye is good at judging linear measures and bad at
+  judging relative areas.
+  ......
+}
+
+ +
+
\author{…}
+
+

Information about the author(s) of the Rd file. Use +\email{} without extra delimiters (such as ‘( )’ or +‘< >’) to specify email addresses, or \url{} or +\href{}{} for web pointers. +

+
+
\seealso{…}
+
+

Pointers to related R objects, using \code{\link{...}} to +refer to them (\code is the correct markup for R object names, +and \link produces hyperlinks in output formats which support +this. See Marking text, and Cross-references). +

+ +
+
\examples{…}
+

Examples of how to use the function. Code in this section is set +in typewriter font without reformatting and is run by +example() unless marked otherwise (see below). +

+

Examples are not only useful for documentation purposes, but also +provide test code used for diagnostic checking of R code. By +default, text inside \examples{} will be displayed in the +output of the help page and run by example() and by R CMD +check. You can use \dontrun{} + +for text that should only be shown, but not run, and +\dontshow{} + +for extra commands for testing that should not be shown to users, but +will be run by example(). (Previously this was called +\testonly, and that is still accepted.) +

+

Text inside \dontrun{} is ‘verbatim’, but the other parts +of the \examples section are R-like text. +

+

For example, +

+
+
x <- runif(10)       # Shown and run.
+\dontrun{plot(x)}    # Only shown.
+\dontshow{log(x)}    # Only run.
+
+ +

Thus, example code not included in \dontrun must be executable! +In addition, it should not use any system-specific features or require +special facilities (such as Internet access or write permission to +specific directories). Text included in \dontrun is indicated by +comments in the processed help files: it need not be valid R code but +the escapes must still be used for %, \ and unpaired +braces as in other ‘verbatim’ text. +

+

Example code must be capable of being run by example, which uses +source. This means that it should not access stdin, +e.g. to scan() data from the example file. +

+

Data needed for making the examples executable can be obtained by random +number generation (for example, x <- rnorm(100)), or by using +standard data sets listed by data() (see ?data for more +info). +

+

Finally, there is \donttest, used (at the beginning of a separate +line) to mark code that should be run by example() but not by +R CMD check (by default: as from R 3.2.0 the option +--run-donttest can be used). This should be needed only +occasionally but can be used for code which might fail in circumstances +that are hard to test for, for example in some locales. (Use +e.g. capabilities() or nzchar(Sys.which("someprogram")) to +test for features needed in the examples wherever possible, and you can +also use try() or tryCatch(). Use interactive() to +condition examples which need someone to interact with.) Note that code +included in \donttest must be correct R code, and any packages +used should be declared in the DESCRIPTION file. It is good +practice to include a comment in the \donttest section explaining +why it is needed. +

+ +
+
\keyword{key}
+

There can be zero or more \keyword sections per file. +Each \keyword section should specify a single keyword, preferably +one of the standard keywords as listed in file KEYWORDS in the +R documentation directory (default R_HOME/doc). Use +e.g. RShowDoc("KEYWORDS") to inspect the standard keywords from +within R. There can be more than one \keyword entry if the R +object being documented falls into more than one category, or none. +

+

Do strongly consider using \concept (see Indices) instead of +\keyword if you are about to use more than very few non-standard +keywords. +

+

The special keyword ‘internal’ marks a page of internal objects +that are not part of the package’s API. If the help page for object +foo has keyword ‘internal’, then help(foo) gives this +help page, but foo is excluded from several object indices, +including the alphabetical list of objects in the HTML help system. +

+

help.search() can search by keyword, including user-defined +values: however the ‘Search Engine & Keywords’ HTML page accessed +via help.start() provides single-click access only to a +pre-defined list of keywords. +

+
+ + +
+ + + +

2.1.2 Documenting data sets

+ +

The structure of Rd files which document R data sets is slightly +different. Sections such as \arguments and \value are not +needed but the format and source of the data should be explained. +

+

As an example, let us look at src/library/datasets/man/rivers.Rd +which documents the standard R data set rivers. +

+
+
+
+
\name{rivers}
+\docType{data}
+\alias{rivers}
+\title{Lengths of Major North American Rivers}
+\description{
+  This data set gives the lengths (in miles) of 141 \dQuote{major}
+  rivers in North America, as compiled by the US Geological
+  Survey.
+}
+\usage{rivers}
+\format{A vector containing 141 observations.}
+\source{World Almanac and Book of Facts, 1975, page 406.}
+\references{
+  McNeil, D. R. (1977) \emph{Interactive Data Analysis}.
+  New York: Wiley.
+}
+\keyword{datasets}
+
+
+
+ +

This uses the following additional markup commands. +

+
+
\docType{…}
+

Indicates the “type” of the documentation object. Always ‘data’ +for data sets, and ‘package’ for pkg-package.Rd +overview files. Documentation for S4 methods and classes uses +‘methods’ (from promptMethods()) and ‘class’ (from +promptClass()). +

+
+
\format{…}
+
+

A description of the format of the data set (as a vector, matrix, data +frame, time series, …). For matrices and data frames this should +give a description of each column, preferably as a list or table. +See Lists and tables, for more information. +

+
+
\source{…}
+
+

Details of the original source (a reference or URL, +see Specifying URLs). In addition, section \references could +give secondary sources and usages. +

+
+ +

Note also that when documenting data set bar, +

+
    +
  • The \usage entry is always bar or (for packages +which do not use lazy-loading of data) data(bar). (In +particular, only document a single data object per Rd file.) +
  • The \keyword entry should always be ‘datasets’. +
+ +

If bar is a data frame, documenting it as a data set can +be initiated via prompt(bar). Otherwise, the promptData +function may be used. +
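A minimal sketch using the placeholder name bar from above (the data values are invented):

    bar <- data.frame(length = c(135, 202, 329), name = c("a", "b", "c"))
    promptData(bar)   # writes 'bar.Rd' with a \format skeleton listing each column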

+
+ + + +

2.1.3 Documenting S4 classes and methods

+ +

There are special ways to use the ‘?’ operator, namely +‘class?topic’ and ‘methods?topic’, to access +documentation for S4 classes and methods, respectively. This mechanism +depends on conventions for the topic names used in \alias +entries. The topic names for S4 classes and methods respectively are of +the form +

+
+
class-class
+generic,signature_list-method
+
+ +

where signature_list contains the names of the classes in the +signature of the method (without quotes) separated by ‘,’ (without +whitespace), with ‘ANY’ used for arguments without an explicit +specification. E.g., ‘genericFunction-class’ is the topic name for +documentation for the S4 class "genericFunction", and +‘coerce,ANY,NULL-method’ is the topic name for documentation for +the S4 method for coerce for signature c("ANY", "NULL"). +

+

Skeletons of documentation for S4 classes and methods can be generated +by using the functions promptClass() and promptMethods() +from package methods. If it is necessary or desired to provide an +explicit function declaration (in a \usage section) for an S4 +method (e.g., if it has “surprising arguments” to be mentioned +explicitly), one can use the special markup +

+
+
\S4method{generic}{signature_list}(argument_list)
+
+ +

(e.g., ‘\S4method{coerce}{ANY,NULL}(from, to)’). +
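A minimal sketch of generating such skeletons with promptClass() and promptMethods(), mentioned above (the class name ‘track’ is invented):

    library(methods)
    setClass("track", representation(x = "numeric", y = "numeric"))
    setMethod("show", "track",
              function(object) cat("<track with", length(object@x), "points>\n"))
    promptClass("track")    # writes 'track-class.Rd'
    promptMethods("show")   # writes 'show-methods.Rd'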

+

To make full use of the potential of the on-line documentation system, +all user-visible S4 classes and methods in a package should at least +have a suitable \alias entry in one of the package’s Rd files. +If a package has methods for a function defined originally somewhere +else, and does not change the underlying default method for the +function, the package is responsible for documenting the methods it +creates, but not for the function itself or the default method. +

+

An S4 replacement method is documented in the same way as an S3 one: see +the description of \method in Documenting functions. +

+ +

See help("Documentation", package = "methods") for more +information on using and creating on-line documentation for S4 classes and +methods. +

+
+ + + +

2.1.4 Documenting packages

+ +

Packages may have an overview help page with an \alias +pkgname-package, e.g. ‘utils-package’ for the +utils package, when package?pkgname will open that +help page. If a topic named pkgname does not exist in +another Rd file, it is helpful to use this as an additional +\alias. +

+

Skeletons of documentation for a package can be generated using the function promptPackage(). If the final = TRUE argument is used, then the Rd file will be generated in final form, containing the information that would be produced up to library(help = pkgname). Otherwise (the default) comments will be inserted giving suggestions for content.
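A minimal sketch (‘mypkg’ is a placeholder for an installed package):

    promptPackage("mypkg")                # skeleton with comments suggesting content
    promptPackage("mypkg", final = TRUE)  # final form, no comments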

+

Apart from the mandatory \name and \title and the +pkgname-package alias, the only requirement for the package +overview page is that it include a \docType{package} statement. +All other content is optional. We suggest that it should be a short +overview, to give a reader unfamiliar with the package enough +information to get started. More extensive documentation is better +placed into a package vignette (see Writing package vignettes) and +referenced from this page, or into individual man pages for the +functions, datasets, or classes. +

+
+ + + +

2.2 Sectioning

+ +

To begin a new paragraph or leave a blank line in an example, just +insert an empty line (as in (La)TeX). To break a line, use +\cr. + +

+

In addition to the predefined sections (such as \description{}, +\value{}, etc.), you can “define” arbitrary ones by +\section{section_title}{…}. + +For example +

+
+
\section{Warning}{
+  You must not call this function unless …
+}
+
+ +

For consistency with the pre-assigned sections, the section name (the +first argument to \section) should be capitalized (but not all +upper case). Whitespace between the first and second braced expressions +is not allowed. Markup (e.g. \code) within the section title +may cause problems with the latex conversion (depending on the version +of macro packages such as ‘hyperref’) and so should be avoided. +

+

The \subsection macro takes arguments in the same format as +\section, but is used within a section, so it may be used to +nest subsections within sections or other subsections. There is no +predefined limit on the nesting level, but formatting is not designed +for more than 3 levels (i.e. subsections within subsections within +sections). +

+

Note that additional named sections are always inserted at a fixed +position in the output (before \note, \seealso and the +examples), no matter where they appear in the input (but in the same +order amongst themselves as in the input). +

+ +
+ + + +

2.3 Marking text

+ + +

The following logical markup commands are available for emphasizing or +quoting text. +

+
+
\emph{text}
+
+
+
\strong{text}
+
+

Emphasize text using italic and bold font if +possible; \strong is regarded as stronger (more emphatic). +

+
+
\bold{text}
+
+

Set text in bold font where possible. +

+
+
\sQuote{text}
+
+
+
\dQuote{text}
+
+

Portably single or double quote text (without hard-wiring the +characters used for quotation marks). +

+
+ +

Each of the above commands takes LaTeX-like input, so other macros +may be used within text. +

+

The following logical markup commands are available for indicating +specific kinds of text. Except as noted, these take ‘verbatim’ text +input, and so other macros may not be used within them. Some characters +will need to be escaped (see Insertions). +

+
+
\code{text}
+
+

Indicate text that is a literal example of a piece of an R program, +e.g., a fragment of R code or the name of an R object. Text is +entered in R-like syntax, and displayed using typewriter font +where possible. Macros \var and \link are interpreted within +text. +

+
+
\preformatted{text}
+
+

Indicate text that is a literal example of a piece of a program. Text +is displayed using typewriter font where possible. Formatting, +e.g. line breaks, is preserved. (Note that this includes a line break +after the initial {, so typically text should start on the same line as +the command.) +

+

Due to limitations in LaTeX as of this writing, this macro may not be +nested within other markup macros other than \dQuote and +\sQuote, as errors or bad formatting may result. +

+
+
\kbd{keyboard-characters}
+
+

Indicate keyboard input, using slanted typewriter font if +possible, so users can distinguish the characters they are supposed to +type from computer output. Text is entered ‘verbatim’. +

+
+
\samp{text}
+
+

Indicate text that is a literal example of a sequence of characters, +entered ‘verbatim’. No wrapping or reformatting will occur. Displayed +using typewriter font where possible. +

+ +
+
\verb{text}
+
+

Indicate text that is a literal example of a sequence of characters, +with no interpretation of e.g. \var, but which will be included +within word-wrapped text. Displayed using typewriter font if +possible. +

+
+
\pkg{package_name}
+
+

Indicate the name of an R package. LaTeX-like. +

+
+
\file{file_name}
+
+

Indicate the name of a file. Text is LaTeX-like, so backslash needs +to be escaped. Displayed using a distinct font where possible. +

+
+
\email{email_address}
+
+

Indicate an electronic mail address. LaTeX-like, will be rendered as +a hyperlink in HTML and PDF conversion. Displayed using +typewriter font where possible. +

+
+
\url{uniform_resource_locator}
+
+

Indicate a uniform resource locator (URL) for the World Wide +Web. The argument is handled as ‘verbatim’ text (with percent and +braces escaped by backslash), and rendered as a hyperlink in HTML and +PDF conversion. Linefeeds are removed, and as from R 3.2.0 leading +and trailing whitespace77 is removed. See Specifying URLs. +

+

Displayed using typewriter font where possible. +

+
+
\href{uniform_resource_locator}{text}
+
+

Indicate a hyperlink to the World Wide Web. The first argument is +handled as ‘verbatim’ text (with percent and braces escaped by +backslash) and is used as the URL in the hyperlink, with the +second argument of LaTeX-like text displayed to the user. Linefeeds +are removed from the first argument, and as from R 3.2.0 leading and +trailing whitespace is removed. +

+

Note that RFC3986-encoded URLs (e.g. using ‘\%28VS.85\%29’ in +place of ‘(VS.85)’) may not work correctly in versions of R +before 3.1.3 and are best avoided—use URLdecode() to decode +them. +

+
+
\var{metasyntactic_variable}
+
+

Indicate a metasyntactic variable. In some cases this will be rendered +distinctly, e.g. in italic, but not in all78. LaTeX-like. +

+
\env{environment_variable}
+
+

Indicate an environment variable. ‘Verbatim’. +Displayed using typewriter font where possible. +

+
\option{option}
+
+

Indicate a command-line option. ‘Verbatim’. +Displayed using typewriter font where possible. +

+
\command{command_name}
+
+

Indicate the name of a command. LaTeX-like, so \var is +interpreted. Displayed using typewriter font where possible. +

+
\dfn{term}
+
+

Indicate the introductory or defining use of a term. LaTeX-like. +

+
\cite{reference}
+
+

Indicate a reference without a direct cross-reference via \link +(see Cross-references), such as the name of a book. LaTeX-like. +

+
\acronym{acronym}
+
+

Indicate an acronym (an abbreviation written in all capital letters), +such as GNU. LaTeX-like. +

+
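As an illustration (an invented fragment, not from any real package), several of these commands can be combined in ordinary LaTeX-like text:

Use \code{\link[utils]{read.table}} from the \pkg{utils} package to read
the file \file{data.csv}; see \url{https://www.r-project.org/} for
background, or contact \email{maintainer@example.org}.  Press \kbd{q} to
leave the pager.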
+ + +
+ + + +

2.4 Lists and tables

+ + + + +

The \itemize and \enumerate commands take a single +argument, within which there may be one or more \item commands. +The text following each \item is formatted as one or more +paragraphs, suitably indented and with the first paragraph marked with a +bullet point (\itemize) or a number (\enumerate). +

+

Note that unlike argument lists, \item in these formats is +followed by a space and the text (not enclosed in braces). For example +

+
+
  \enumerate{
+    \item A database consists of one or more records, each with one or
+    more named fields.
+    \item Regular lines start with a non-whitespace character.
+    \item Records are separated by one or more empty lines.
+  }
+
+ +

\itemize and \enumerate commands may be nested. +

+ +

The \describe command is similar to \itemize but allows +initial labels to be specified. Each \item takes two arguments, +the label and the body of the item, in exactly the same way as an +argument or value \item. \describe commands are mapped to +<DL> lists in HTML and \description lists in LaTeX. +

+ +
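For example, the variables of a hypothetical data set could be described as:

\describe{
  \item{height}{Height of the subject, in inches.}
  \item{weight}{Weight of the subject, in pounds.}
}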

The \tabular command takes two arguments. The first gives for +each of the columns the required alignment (‘l’ for +left-justification, ‘r’ for right-justification or ‘c’ for +centring.) The second argument consists of an arbitrary number of +lines separated by \cr, and with fields separated by \tab. +For example: +

+
+
  \tabular{rlll}{
+    [,1] \tab Ozone   \tab numeric \tab Ozone (ppb)\cr
+    [,2] \tab Solar.R \tab numeric \tab Solar R (lang)\cr
+    [,3] \tab Wind    \tab numeric \tab Wind (mph)\cr
+    [,4] \tab Temp    \tab numeric \tab Temperature (degrees F)\cr
+    [,5] \tab Month   \tab numeric \tab Month (1--12)\cr
+    [,6] \tab Day     \tab numeric \tab Day of month (1--31)
+  }
+
+ +

There must be the same number of fields on each line as there are +alignments in the first argument, and they must be non-empty (but can +contain only spaces). (There is no whitespace between \tabular +and the first argument, nor between the two arguments.) +

+
+ + + +

2.5 Cross-references

+ + + +

The markup \link{foo} (usually in the combination +\code{\link{foo}}) produces a hyperlink to the help for +foo. Here foo is a topic, that is the argument of +\alias markup in another Rd file (possibly in another package). +Hyperlinks are supported in some of the formats to which Rd files are +converted, for example HTML and PDF, but ignored in others, e.g. +the text format. +

+

One main usage of \link is in the \seealso section of the +help page, see Rd format. +

+
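A typical \seealso section might therefore read (the targets here are purely illustrative):

\seealso{
  \code{\link{glm}} for fitting generalized linear models;
  \code{\link[MASS]{rlm}} in package \pkg{MASS} for resistant regression.
}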

Note that whereas leading and trailing spaces are stripped when +extracting a topic from a \alias, they are not stripped when +looking up the topic of a \link. +

+ +

You can specify a link to a different topic than its name by +\link[=dest]{name} which links to topic dest +with name name. This can be used to refer to the documentation +for S3/4 classes, for example \code{"\link[=abc-class]{abc}"} +would be a way to refer to the documentation of an S4 class "abc" +defined in your package, and +\code{"\link[=terms.object]{terms}"} to the S3 "terms" +class (in package stats). To make these easy to read in the +source file, \code{"\linkS4class{abc}"} expands to the form +given above. +

+

There are two other forms of optional argument specified as +\link[pkg]{foo} and +\link[pkg:bar]{foo} to link to the package +pkg, to files foo.html and +bar.html respectively. These are rarely needed, perhaps to +refer to not-yet-installed packages (but there the HTML help system +will resolve the link at run time) or in the normally undesirable event +that more than one package offers help on a topic79 (in +which case the present package has precedence so this is only needed to +refer to other packages). They are currently only used in HTML help +(and ignored for hyperlinks in LaTeX conversions of help pages), and +link to the file rather than the topic (since there is no way to know +which topics are in which files in an uninstalled package). The +only reason to use these forms for base and recommended +packages is to force a reference to a package that might be further down +the search path. Because they have been frequently misused, the HTML +help system looks for topic foo in package pkg +if it does not find file foo.html. +

+
+ + + +

2.6 Mathematics

+ + + + +

Mathematical formulae should be set beautifully for printed +documentation yet we still want something useful for text and HTML +online help. To this end, the two commands +\eqn{latex}{ascii} and +\deqn{latex}{ascii} are used. Whereas \eqn +is used for “inline” formulae (corresponding to TeX’s +$…$), \deqn gives “displayed equations” (as in +LaTeX’s displaymath environment, or TeX’s +$$…$$). Both arguments are treated as ‘verbatim’ text. +

+

Both commands can also be used as \eqn{latexascii} (only +one argument) which then is used for both latex and +ascii. No whitespace is allowed between command and the first +argument, nor between the first and second arguments. +

+

The following example is from Poisson.Rd: +

+
+
  \deqn{p(x) = \frac{\lambda^x e^{-\lambda}}{x!}}{%
+	p(x) = \lambda^x exp(-\lambda)/x!}
+  for \eqn{x = 0, 1, 2, \ldots}.
+
+ + +

For text on-line help we get +

+
+
+
+
    p(x) = lambda^x exp(-lambda)/x!
+
+for x = 0, 1, 2, ....
+
+
+
+ +

Greek letters (both cases) will be rendered in HTML if preceded by a +backslash, \dots and \ldots will be rendered as ellipses +and \sqrt, \ge and \le as mathematical symbols. +

+

Note that only basic LaTeX can be used, there being no provision to +specify LaTeX style files such as the AMS extensions. +

+
+ + + +

2.7 Figures

+ + + +

To include figures in help pages, use the \figure markup. There +are three forms. +

+

The two commonly used simple forms are \figure{filename} +and \figure{filename}{alternate text}. This will +include a copy of the figure in either HTML or LaTeX output. In text +output, the alternate text will be displayed instead. (When the second +argument is omitted, the filename will be used.) Both the filename and +the alternate text will be parsed verbatim, and should not include +special characters that are significant in HTML or LaTeX. +

+

The expert form is \figure{filename}{options: +string}. (The word ‘options:’ must be typed exactly as +shown and followed by at least one space.) In this form, the +string is copied into the HTML img tag as attributes +following the src attribute, or into the second argument of the +\Figure macro in LaTeX, which by default is used as options to +an \includegraphics call. As it is unlikely that any single +string would suffice for both display modes, the expert form would +normally be wrapped in conditionals. It is up to the author to make +sure that legal HTML/LaTeX is used. For example, to include a +logo in both HTML (using the simple form) and LaTeX (using the +expert form), the following could be used: +

+
+
\if{html}{\figure{logo.jpg}{Our logo}}
+\if{latex}{\figure{logo.jpg}{options: width=0.5in}}
+
+ +

The files containing the figures should be stored in the directory +man/figures. Files with extensions .jpg, .jpeg, +.pdf, .png and .svg from that directory will be +copied to the help/figures directory at install time. (Figures in +PDF format will not display in most HTML browsers, but might be the +best choice in reference manuals.) Specify the filename relative to +man/figures in the \figure directive. +

+
+ +
+


+
+ +

2.8 Insertions

+ + +

Use \R for the R system itself. Use \dots + +for the dots in function argument lists ‘...’, and +\ldots + +for ellipsis dots in ordinary text.80 These can be followed by +{}, and should be unless followed by whitespace. +

+

After an unescaped ‘%’, you can put your own comments regarding the +help text. The rest of the line (but not the newline at the end) will +be completely disregarded. Therefore, you can also use it to make part +of the “help” invisible. +

+

You can produce a backslash (‘\’) by escaping it by another +backslash. (Note that \cr is used for generating line breaks.) +

+

The “comment” character ‘%’ and unpaired braces81 +almost always need to be escaped by ‘\’, and ‘\\’ can +be used for backslash and needs to be when there are two or more adjacent +backslashes. In R-like code quoted strings are handled slightly +differently; see “Parsing Rd files” for details – in particular braces should not be +escaped in quoted strings. +

+

All of ‘% { } \’ should be escaped in LaTeX-like text. +

+ +
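For instance, the following invented fragment shows the common escapes in LaTeX-like text:

By default about 5\% of the values are trimmed.  % rest of this line is a comment
A literal backslash is written as \\, and an unpaired brace as \{ or \}.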

Text which might need to be represented differently in different +encodings should be marked by \enc, e.g. +\enc{Jöreskog}{Joreskog} (with no whitespace between the +braces) where the first argument will be used where encodings are +allowed and the second should be ASCII (and is used for e.g. +the text conversion in locales that cannot represent the encoded form). +(This is intended to be used for individual words, not whole sentences +or paragraphs.) +

+
+ + + +

2.9 Indices

+ + +

The \alias command (see Documenting functions) is used to +specify the “topics” documented, which should include all R +objects in a package such as functions and variables, data sets, and S4 +classes and methods (see Documenting S4 classes and methods). The +on-line help system searches the index data base consisting of all +alias topics. +

+ +

In addition, it is possible to provide “concept index entries” using +\concept, which can be used for help.search() lookups. +E.g., file cor.test.Rd in the standard package stats +contains +

+
+
\concept{Kendall correlation coefficient}
+\concept{Pearson correlation coefficient}
+\concept{Spearman correlation coefficient}
+
+ +

so that e.g. ??Spearman will succeed in finding the +help page for the test for association between paired samples using +Spearman’s rho. +

+ +

(Note that help.search() only uses “sections” of documentation +objects with no additional markup.) +

+

If you want to cross reference such items from other help files via +\link, you need to use \alias and not \concept. +

+ +
+ + + +

2.10 Platform-specific documentation

+ + +

Sometimes the documentation needs to differ by platform. Currently two +OS-specific options are available, ‘unix’ and ‘windows’, and +lines in the help source file can be enclosed in +

+
+
#ifdef OS
+   ...
+#endif
+
+ +

or +

+
+
#ifndef OS
+   ...
+#endif
+
+ +

for OS-specific inclusion or exclusion. Such blocks should not be +nested, and should be entirely within a block (that, is between the +opening and closing brace of a section or item), or at top-level contain +one or more complete sections. +

+
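For example, a complete top-level section intended only for the Windows build of the help page could be written as (the wording is illustrative):

#ifdef windows
\section{Warning}{
  This functionality is only available when R is running under Windows.
}
#endif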

If the differences between platforms are extensive or the R objects +documented are only relevant to one platform, platform-specific Rd files +can be put in a unix or windows subdirectory. +

+
+ + + +

2.11 Conditional text

+ + + + + +

Occasionally the best content for one output format is different from +the best content for another. For this situation, the +\if{format}{text} or +\ifelse{format}{text}{alternate} markup +is used. Here format is a comma separated list of formats in +which the text should be rendered. The alternate will be +rendered if the format does not match. Both text and +alternate may be any sequence of text and markup. +

+

Currently the following formats are recognized: example, +html, latex and text. These select output for +the corresponding targets. (Note that example refers to +extracted example code rather than the displayed example in some other +format.) Also accepted are TRUE (matching all formats) and +FALSE (matching no formats). These could be the output +of the \Sexpr macro (see Dynamic pages). +

+

The \out{literal} macro would usually be used within +the text part of \if{format}{text}. It +causes the renderer to output the literal text exactly, with no +attempt to escape special characters. For example, use +the following to output the markup necessary to display the Greek letter alpha in +LaTeX or HTML, and the text string alpha in other formats: +

+
\if{latex}{\out{\alpha}}\ifelse{html}{\out{&alpha;}}{alpha}
+
+ +
+ + + +

2.12 Dynamic pages

+ + + + +

Two macros supporting dynamically generated man pages are \Sexpr +and \RdOpts. These are modelled after Sweave, and are intended +to contain executable R expressions in the Rd file. +

+

The main argument to \Sexpr must be valid R code that can be +executed. It may also take options in square brackets before the main +argument. Depending on the options, the code may be executed at +package build time, package install time, or man page rendering time. +

+

The options follow the same format as in Sweave, but different options +are supported. Currently the allowed options and their defaults are: +

+
    +
  • eval=TRUE +Whether the R code should be evaluated. + +
  • echo=FALSE +Whether the R code should be echoed. If TRUE, a display will +be given in a preformatted block. For example, +\Sexpr[echo=TRUE]{ x <- 1 } will be displayed as +
    +
    > x <- 1
    +
    + +
  • keep.source=TRUE +Whether to keep the author’s formatting when displaying the +code, or throw it away and use a deparsed version. + +
  • results=text +How should the results be displayed? The possibilities +are: + +
      +
    • - results=text +Apply as.character() to the result of the code, and insert it +as a text element. + +
    • - results=verbatim +Print the results of the code just as if it was executed at the console, +and include the printed results verbatim. (Invisible results will not print.) + +
    • - results=rd +The result is assumed to be a character vector containing markup to be +passed to parse_Rd(), with the result inserted in place. This +could be used to insert computed aliases, for instance. +parse_Rd() is called first with fragment = FALSE to allow +a single Rd section macro to be inserted. If that fails, it is called +again with fragment = TRUE, the older behavior. + +
    • - results=hide +Insert no output. +
    + +
  • strip.white=TRUE +Remove leading and trailing white space from each line of +output if strip.white=TRUE. With +strip.white=all, also remove blank lines. + +
  • stage=install +Control when this macro is run. Possible values are +
      +
    • - stage=build +The macro is run when building a source tarball. + +
    • - stage=install +The macro is run when installing from source. + +
    • - stage=render +The macro is run when displaying the help page. +
    + +

    Conditionals such as #ifdef +(see Platform-specific sections) are applied after the +build macros but before the install macros. In some +situations (e.g. installing directly from a source directory without a +tarball, or building a binary package) the above description is not +literally accurate, but authors can rely on the sequence being +build, #ifdef, install, render, with all +stages executed. +

    +

    Code is only run once in each stage, so a \Sexpr[results=rd] +macro can output an \Sexpr macro designed for a later stage, +but not for the current one or any earlier stage. +

    +
  • width, height, fig +These options are currently allowed but ignored. +
+ +
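As a small illustration (not taken from any particular package), a render-time expression can be embedded directly in the help text:

This page was rendered under \Sexpr[stage=render]{R.version.string}.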

The \RdOpts macro is used to set new defaults for options to apply +to following uses of \Sexpr. +

+

For more details, see the online document +“Parsing Rd files”. +

+
+ + + +

2.13 User-defined macros

+ + + + +

The \newcommand and \renewcommand macros allow new macros +to be defined within an Rd file. These are similar but not identical to +the same-named LaTeX macros. +

+

They each take two arguments which are parsed verbatim. The first is +the name of the new macro including the initial backslash, and the second +is the macro definition. As in LaTeX, \newcommand requires that the +new macro not have been previously defined, whereas \renewcommand +allows existing macros (including all built-in ones) to be replaced. +(As from version 3.2.0, this test is disabled by default, but may +be enabled by setting the environment variable _WARN_DUPLICATE_RD_MACROS_ +to a true value.) +

+

Also as in LaTeX, the new macro may be defined to take arguments, +and numeric placeholders such as #1 are used in the macro +definition. However, unlike LaTeX, the number of arguments is +determined automatically from the highest placeholder number seen in +the macro definition. For example, a macro definition containing +#1 and #3 (but no other placeholders) will define a +three argument macro (whose second argument will be ignored). As in +LaTeX, at most 9 arguments may be defined. If the # +character is followed by a non-digit it will have no special +significance. All arguments to user-defined macros will be parsed as +verbatim text, and simple text-substitution will be used to replace +the place-holders, after which the replacement text will be parsed. +

+

As of R version 3.2.0, a number of macros are defined in the file +share/Rd/macros/system.Rd of the R source or home +directory, and these will normally be available in all .Rd files. +For example, that file contains the definition +

+
\newcommand{\PR}{\Sexpr[results=rd]{tools:::Rd_expr_PR(#1)}}
+
+

which defines \PR to be a single argument macro; then code +(typically used in the NEWS.Rd file) like +

+
\PR{1234}
+
+

will expand to +

+
\Sexpr[results=rd]{tools:::Rd_expr_PR(1234)}
+
+

when parsed. +

+

Some macros that might be of general use are: +

+
\CRANpkg{pkg} + +
+

A package on CRAN +

+
+
\sspace + +
+

A single space (used after a period that does not end a sentence). +

+
+
\doi{numbers} + +
+

A digital object identifier (DOI). +

+
+

See the system.Rd file in share/Rd/macros for more details +and macro definitions, including macros \packageTitle, +\packageDescription, \packageAuthor, \packageMaintainer, +\packageDESCRIPTION and \packageIndices. + + + + + + +

+ +

Packages may also define their own common macros; these would be stored +in an .Rd file in man/macros in the package source and +will be installed into help/macros when the package is installed. +A package may also use the macros from a different package by listing +the other package in the ‘RdMacros’ field in the DESCRIPTION +file. +

+ + +
+ + + +

2.14 Encoding

+ + +

Rd files are text files and so it is impossible to deduce the encoding +they are written in unless ASCII: files with 8-bit characters +could be UTF-8, Latin-1, Latin-9, KOI8-R, EUC-JP, etc. So an +\encoding{} section must be used to specify the encoding if it +is not ASCII. (The \encoding{} section must be on a +line by itself, and in particular one containing no non-ASCII +characters. The encoding declared in the DESCRIPTION file will +be used if none is declared in the file.) The Rd files are +converted to UTF-8 before parsing and so the preferred encoding for the +files themselves is now UTF-8. +

+

Wherever possible, avoid non-ASCII chars in Rd files, and +even symbols such as ‘<’, ‘>’, ‘$’, ‘^’, ‘&’, +‘|’, ‘@’, ‘~’, and ‘*’ outside ‘verbatim’ +environments (since they may disappear in fonts designed to render +text). (Function showNonASCIIfile in package tools can help +in finding non-ASCII bytes in the files.) +

+

For convenience, encoding names ‘latin1’ and ‘latin2’ are +always recognized: these and ‘UTF-8’ are likely to work fairly +widely. However, this does not mean that all characters in UTF-8 will +be recognized, and the coverage of non-Latin characters82 is fairly low. Using LaTeX +inputenx (see ?Rd2pdf in R) will give greater coverage +of UTF-8. +

+

The \enc command (see Insertions) can be used to provide +transliterations which will be used in conversions that do not support +the declared encoding. +

+

The LaTeX conversion converts the file to UTF-8 from the declared +encoding, and includes a +

+
+
\inputencoding{utf8}
+
+ +

command, and this needs to be matched by a suitable invocation of the +\usepackage{inputenc} command. The R utility R +CMD Rd2pdf looks at the converted code and includes the encodings used: +it might for example use +

+
+
\usepackage[utf8]{inputenc}
+
+ +

(Use of utf8 as an encoding requires LaTeX dated 2003/12/01 or +later. Also, the use of Cyrillic characters in ‘UTF-8’ appears to +also need ‘\usepackage[T2A]{fontenc}’, and R CMD Rd2pdf +includes this conditionally on the file t2aenc.def being present +and environment variable _R_CYRILLIC_TEX_ being set.) +

+

Note that this mechanism works best with Latin letters: the coverage of +UTF-8 in LaTeX is quite low. +

+ + +
+ + + +

2.15 Processing documentation files

+ + +

There are several commands to process Rd files from the system command +line. +

+ +

Using R CMD Rdconv one can convert R documentation format to +other formats, or extract the executable examples for run-time testing. +The currently supported conversions are to plain text, HTML and +LaTeX as well as extraction of the examples. +

+ +

R CMD Rd2pdf generates PDF output from documentation in Rd +files, which can be specified either explicitly or by the path to a +directory with the sources of a package. In the latter case, a +reference manual for all documented objects in the package is created, +including the information in the DESCRIPTION files. +

+ + +

R CMD Sweave and R CMD Stangle process vignette-like +documentation files (e.g. Sweave vignettes with extension +‘.Snw’ or ‘.Rnw’, or other non-Sweave vignettes). +R CMD Stangle is used to extract the R code fragments. +

+

The exact usage and a detailed list of available options for all of +these commands can be obtained by running R CMD command +--help, e.g., R CMD Rdconv --help. All available commands can be +listed using R --help (or Rcmd --help under Windows). +

+

All of these work under Windows. You may need to have installed +the tools to build packages from source as described in the “R +Installation and Administration” manual, although typically all that is +needed is a LaTeX installation. +

+
+ + + +

2.16 Editing Rd files

+ + +

It can be very helpful to prepare .Rd files using an editor which +knows about their syntax and will highlight commands, indent to show the +structure and detect mis-matched braces, and so on. +

+

The system most commonly used for this is some version of +Emacs (including XEmacs) with the ESS +package (http://ess.r-project.org/: it is often installed with +Emacs but may need to be loaded, or even installed, +separately). +

+

Another is the Eclipse IDE with the Stat-ET plugin +(http://www.walware.de/goto/statet), and (on Windows only) +Tinn-R (http://sourceforge.net/projects/tinn-r/). +

+

People have also used LaTeX mode in an editor, as .Rd files are +rather similar to LaTeX files. +

+

Some R front-ends provide editing support for .Rd files, for +example RStudio (https://rstudio.org/). +

+
+ +
+


+
+ +

3 Tidying and profiling R code

+ + + + + + + + +

R code which is worth preserving in a package and perhaps making +available for others to use is worth documenting, tidying up and perhaps +optimizing. The last two of these activities are the subject of this +chapter. +

+
+ + + +

3.1 Tidying R code

+ + +

R treats function code loaded from packages and code entered by users +differently. By default code entered by users has the source code stored +internally, and when the function is listed, the original source is +reproduced. Loading code from a package (by default) discards the +source code, and the function listing is re-created from the parse tree +of the function. +

+

Normally keeping the source code is a good idea, and in particular it +avoids comments being removed from the source. However, we can make +use of the ability to re-create a function listing from its parse tree +to produce a tidy version of the function, for example with consistent +indentation and spaces around operators. If the original source +does not follow the standard format this tidied version can be much +easier to read. +

+

We can subvert the keeping of source in two ways. +

+
    +
  1. The option keep.source can be set to FALSE before the code +is loaded into R. +
  2. The stored source code can be removed by calling the removeSource() +function, for example by + +
    +
    myfun <- removeSource(myfun)
    +
    + +
+ +

In each case if we then list the function we will get the standard +layout. +

+

Suppose we have a file of functions myfuns.R that we want to +tidy up. Create a file tidy.R containing +

+
+
source("myfuns.R", keep.source = FALSE)
+dump(ls(all = TRUE), file = "new.myfuns.R")
+
+ +

and run R with this as the source file, for example by R +--vanilla < tidy.R or by pasting into an R session. Then the file +new.myfuns.R will contain the functions in alphabetical order in +the standard layout. Warning: comments in your functions will be lost. +

+

The standard format provides a good starting point for further tidying. +Although the deparsing cannot do so, we recommend the consistent use of +the preferred assignment operator ‘<-’ (rather than ‘=’) for +assignment. Many package authors use a version of Emacs (on a +Unix-alike or Windows) to edit R code, using the ESS[S] mode of the +ESS Emacs package. See R coding +standards in R Internals for style options within the ESS[S] mode +recommended for the source code of R itself. +

+ + +
+ + + +

3.2 Profiling R code for speed

+ + + +

It is possible to profile R code on Windows and most83 Unix-alike versions of +R. +

+

The command Rprof is used to control profiling, and its help +page can be consulted for full details. Profiling works by recording +at fixed intervals84 (by default every 20 msecs) +which line in which R function is being used, and recording the +results in a file (default Rprof.out in the working directory). +Then the function summaryRprof or the command-line utility +R CMD Rprof Rprof.out can be used to summarize the +activity. +

+

As an example, consider the following code (from Venables & Ripley, +2002, pp. 225–6). +

+
+
library(MASS); library(boot)
+storm.fm <- nls(Time ~ b*Viscosity/(Wt - c), stormer,
+		start = c(b=30.401, c=2.2183))
+st <- cbind(stormer, fit=fitted(storm.fm))
+storm.bf <- function(rs, i) {
+    st$Time <-  st$fit + rs[i]
+    tmp <- nls(Time ~ (b * Viscosity)/(Wt - c), st,
+	       start = coef(storm.fm))
+    tmp$m$getAllPars()
+}
+rs <- scale(resid(storm.fm), scale = FALSE) # remove the mean
+Rprof("boot.out")
+storm.boot <- boot(rs, storm.bf, R = 4999) # slow enough to profile
+Rprof(NULL)
+
+ +

Having run this we can summarize the results by +

+
+
R CMD Rprof boot.out
+
+Each sample represents 0.02 seconds.
+Total run time: 22.52 seconds.
+
+Total seconds: time spent in function and callees.
+Self seconds: time spent in function alone.
+
+
   %       total       %        self
+ total    seconds     self    seconds    name
+ 100.0     25.22       0.2      0.04     "boot"
+  99.8     25.18       0.6      0.16     "statistic"
+  96.3     24.30       4.0      1.02     "nls"
+  33.9      8.56       2.2      0.56     "<Anonymous>"
+  32.4      8.18       1.4      0.36     "eval"
+  31.8      8.02       1.4      0.34     ".Call"
+  28.6      7.22       0.0      0.00     "eval.parent"
+  28.5      7.18       0.3      0.08     "model.frame"
+  28.1      7.10       3.5      0.88     "model.frame.default"
+  17.4      4.38       0.7      0.18     "sapply"
+  15.0      3.78       3.2      0.80     "nlsModel"
+  12.5      3.16       1.8      0.46     "lapply"
+  12.3      3.10       2.7      0.68     "assign"
+ ...
+
+
   %        self        %      total
+  self    seconds     total   seconds    name
+   5.7      1.44       7.5      1.88     "inherits"
+   4.0      1.02      96.3     24.30     "nls"
+   3.6      0.92       3.6      0.92     "$"
+   3.5      0.88      28.1      7.10     "model.frame.default"
+   3.2      0.80      15.0      3.78     "nlsModel"
+   2.8      0.70       9.8      2.46     "qr.coef"
+   2.7      0.68      12.3      3.10     "assign"
+   2.5      0.64       2.5      0.64     ".Fortran"
+   2.5      0.62       7.1      1.80     "qr.default"
+   2.2      0.56      33.9      8.56     "<Anonymous>"
+   2.1      0.54       5.9      1.48     "unlist"
+   2.1      0.52       7.9      2.00     "FUN"
+  ...
+
+ +
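The same information can be obtained from within R using summaryRprof, e.g. (a sketch re-using the boot.out file written above):

pr <- summaryRprof("boot.out")
head(pr$by.total, 10)   # functions ranked by total time
head(pr$by.self, 10)    # and by time spent in the function itself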

This often produces +surprising results and can be used to identify bottlenecks or pieces of +R code that could benefit from being replaced by compiled code. +

+

Two warnings: profiling does impose a small performance penalty, and the +output files can be very large if long runs are profiled at the default +sampling interval. +

+

Profiling short runs can sometimes give misleading results. R from +time to time performs garbage collection to reclaim unused +memory, and this takes an appreciable amount of time which profiling +will charge to whichever function happens to provoke it. It may be +useful to compare profiling code immediately after a call to gc() +with a profiling run without a preceding call to gc. +

+

More detailed analysis of the output can be achieved by the tools in the +CRAN packages proftools and profr: in +particular these allow call graphs to be studied. +

+
+ + + +

3.3 Profiling R code for memory use

+ + + +

Measuring memory use in R code is useful either when the code takes +more memory than is conveniently available or when memory allocation and +copying of objects is responsible for slow code. There are three ways to +profile memory use over time in R code. All three require R to +have been compiled with --enable-memory-profiling, which is not +the default, but is currently used for the OS X and Windows binary +distributions. All can be misleading, for different reasons. +

+

In understanding the memory profiles it is useful to know a little more +about R’s memory allocation. Looking at the results of gc() +shows a division of memory into Vcells used to store the contents +of vectors and Ncells used to store everything else, including +all the administrative overhead for vectors such as type and length +information. In fact the vector contents are divided into two +pools. Memory for small vectors (by default 128 bytes or less) is +obtained in large chunks and then parcelled out by R; memory for +larger vectors is obtained directly from the operating system. +

+

Some memory allocation is obvious in interpreted code, for example, +

+
+
y <- x + 1
+
+ +

allocates memory for a new vector y. Other memory allocation is +less obvious and occurs because R is forced to make good on its +promise of ‘call-by-value’ argument passing. When an argument is +passed to a function it is not immediately copied. Copying occurs (if +necessary) only when the argument is modified. This can lead to +surprising memory use. For example, in the ‘survey’ package we have +

+
+
print.svycoxph <- function (x, ...)
+{
+    print(x$survey.design, varnames = FALSE, design.summaries = FALSE,
+	...)
+    x$call <- x$printcall
+    NextMethod()
+}
+
+ +

It may not be obvious that the assignment to x$call will cause +the entire object x to be copied. This copying to preserve the +call-by-value illusion is usually done by the internal C function +duplicate. +

+

The main reason that memory-use profiling is difficult is garbage +collection. Memory is allocated at well-defined times in an R +program, but is freed whenever the garbage collector happens to run. +

+ + + + + + +
+ + + +

3.3.1 Memory statistics from Rprof

+ + + +

The sampling profiler Rprof described in the previous section can +be given the option memory.profiling=TRUE. It then writes out the +total R memory allocation in small vectors, large vectors, and cons +cells or nodes at each sampling interval. It also writes out the number +of calls to the internal function duplicate, which is called to +copy R objects. summaryRprof provides summaries of this +information. The main reason that this can be misleading is that the +memory use is attributed to the function running at the end of the +sampling interval. A second reason is that garbage collection can make +the amount of memory in use decrease, so a function appears to use +little memory. Running under gctorture helps with both problems: +it slows down the code to effectively increase the sampling frequency +and it makes each garbage collection release a smaller amount of memory. +Changing the memory limits with mem.limits() may also be useful, +to see how the code would run under different memory conditions. +

+
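A minimal sketch, re-using the objects from the speed-profiling example earlier in this chapter (the output file name is arbitrary):

Rprof("boot.memstats", memory.profiling = TRUE)
storm.boot <- boot(rs, storm.bf, R = 499)      # a shorter run than before
Rprof(NULL)
summaryRprof("boot.memstats", memory = "both") # adds memory columns to the summaries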
+ + + +

3.3.2 Tracking memory allocations

+ + +

The second method of memory profiling uses a memory-allocation +profiler, Rprofmem(), which writes out a stack trace to an +output file every time a large vector is allocated (with a +user-specified threshold for ‘large’) or a new page of memory is +allocated for the R heap. Summary functions for this output are still +being designed. +

+

Running the example from the previous section with +

+
+
> Rprofmem("boot.memprof",threshold=1000)
+> storm.boot <- boot(rs, storm.bf, R = 4999)
+> Rprofmem(NULL)
+
+ +

shows that apart from some initial and final work in boot there +are no vector allocations over 1000 bytes. +

+
+ + + +

3.3.3 Tracing copies of an object

+ + + +

The third method of memory profiling involves tracing copies made of a +specific (presumably large) R object. Calling tracemem on an +object marks it so that a message is printed to standard output when +the object is copied via duplicate or coercion to another type, +or when a new object of the same size is created in arithmetic +operations. The main reason that this can be misleading is that +copying of subsets or components of an object is not tracked. It may +be helpful to use tracemem on these components. +

+ +

In the example above we can run tracemem on the data frame +st +

+
+
> tracemem(st)
+[1] "<0x9abd5e0>"
+> storm.boot <- boot(rs, storm.bf, R = 4)
+memtrace[0x9abd5e0->0x92a6d08]: statistic boot
+memtrace[0x92a6d08->0x92a6d80]: $<-.data.frame $<- statistic boot
+memtrace[0x92a6d80->0x92a6df8]: $<-.data.frame $<- statistic boot
+memtrace[0x9abd5e0->0x9271318]: statistic boot
+memtrace[0x9271318->0x9271390]: $<-.data.frame $<- statistic boot
+memtrace[0x9271390->0x9271408]: $<-.data.frame $<- statistic boot
+memtrace[0x9abd5e0->0x914f558]: statistic boot
+memtrace[0x914f558->0x914f5f8]: $<-.data.frame $<- statistic boot
+memtrace[0x914f5f8->0x914f670]: $<-.data.frame $<- statistic boot
+memtrace[0x9abd5e0->0x972cbf0]: statistic boot
+memtrace[0x972cbf0->0x972cc68]: $<-.data.frame $<- statistic boot
+memtrace[0x972cc68->0x972cd08]: $<-.data.frame $<- statistic boot
+memtrace[0x9abd5e0->0x98ead98]: statistic boot
+memtrace[0x98ead98->0x98eae10]: $<-.data.frame $<- statistic boot
+memtrace[0x98eae10->0x98eae88]: $<-.data.frame $<- statistic boot
+
+ +

The object is duplicated fifteen times, three times for each of the +R+1 calls to storm.bf. This is surprising, since none of the duplications happen inside nls. Stepping through storm.bf in the debugger shows that all three happen in the line +

+
+
st$Time <- st$fit + rs[i]
+
+ +

Data frames are slower than matrices and this is an example of why. +Using tracemem(st$Viscosity) does not reveal any additional +copying. +

+
+ + + +

3.4 Profiling compiled code

+ + +

Profiling compiled code is highly system-specific, but this section +contains some hints gleaned from various R users. Some methods need +to be different for a compiled executable and for dynamic/shared +libraries/objects as used by R packages. We know of no good way to +profile DLLs on Windows. +

+ + + + + + +
+ + + +

3.4.1 Linux

+ +

Options include using sprof for a shared object, and +oprofile (see http://oprofile.sourceforge.net/) and +perf (see +https://perf.wiki.kernel.org/index.php/Tutorial) for any +executable or shared object. +

+ +

3.4.1.1 sprof

+ +

You can select shared objects to be profiled with sprof by +setting the environment variable LD_PROFILE. For example +

+
+
% setenv LD_PROFILE /path/to/R_HOME/library/stats/libs/stats.so
+R
+... run the boot example
+% sprof /path/to/R_HOME/library/stats/libs/stats.so \
+  /var/tmp/path/to/R_HOME/library/stats/libs/stats.so.profile
+
+Flat profile:
+
+Each sample counts as 0.01 seconds.
+  %   cumulative   self              self     total
+ time   seconds   seconds    calls  us/call  us/call  name
+ 76.19      0.32     0.32        0     0.00           numeric_deriv
+ 16.67      0.39     0.07        0     0.00           nls_iter
+  7.14      0.42     0.03        0     0.00           getListElement
+
+rm /var/tmp/path/to/R_HOME/library/stats/libs/stats.so.profile
+... to clean up ...
+
+ +

It is possible that root access is needed to create the directories used +for the profile data. +

+ +

3.4.1.2 oprofile and operf

+ +

The oprofile project has two modes of operation. In what is +now called ‘legacy’ mode, it uses a daemon to collect information on +a process (see below). Since version 0.9.8 (August 2012), the preferred +mode is to use operf, so we discuss that first. The modes +differ in how the profiling data is collected: it is analysed by tools +such as opreport and opannotate in both. +

+

Here is an example on x86_64 Linux using R 3.0.2. File +pvec.R contains the part of the examples from pvec in +package parallel: +

+
library(parallel)
+N <- 1e6
+dates <- sprintf('%04d-%02d-%02d', as.integer(2000+rnorm(N)),
+                 as.integer(runif(N, 1, 12)), as.integer(runif(N, 1, 28)))
+system.time(a <- as.POSIXct(dates, format = "%Y-%m-%d"))
+
+

with timings from the final step +

+
   user  system elapsed
+  0.371   0.237   0.612
+
+ +

R-level profiling by Rprof shows +

+
                     self.time self.pct total.time total.pct
+"strptime"                1.70    41.06       1.70     41.06
+"as.POSIXct.POSIXlt"      1.40    33.82       1.42     34.30
+"sprintf"                 0.74    17.87       0.98     23.67
+...
+
+

so the conversion from character to POSIXlt takes most of the +time. +

+

This can be run under operf and analysed by +

+
operf R -f pvec.R
+opreport
+opreport -l /path/to/R_HOME/bin/exec/R
+opannotate --source /path/to/R_HOME/bin/exec/R
+## And for the system time
+opreport -l /lib64/libc.so.6
+
+

The first report shows where (which library etc) the time was spent: +

+
CPU_CLK_UNHALT...|
+  samples|      %|
+------------------
+   166761 99.9161 Rdev
+	CPU_CLK_UNHALT...|
+	  samples|      %|
+	------------------
+	    70586 42.3276 no-vmlinux
+	    56963 34.1585 libc-2.16.so
+	    36922 22.1407 R
+	     1584  0.9499 stats.so
+	      624  0.3742 libm-2.16.so
+...
+
+ +

The rest of the output is voluminous, and only extracts are shown below. +

+

Most of the time within R is spent in +

+
samples  %        image name symbol name
+10397    28.5123  R           R_gc_internal
+5683     15.5848  R           do_sprintf
+3036      8.3258  R           do_asPOSIXct
+2427      6.6557  R           do_strptime
+2421      6.6392  R           Rf_mkCharLenCE
+1480      4.0587  R           w_strptime_internal
+1202      3.2963  R           Rf_qnorm5
+1165      3.1948  R           unif_rand
+675       1.8511  R           mktime0
+617       1.6920  R           makelt
+617       1.6920  R           validate_tm
+584       1.6015  R           day_of_the_week
+...
+
+

opannotate shows that 31% of the time in R is spent in +memory.c, 21% in datetime.c and 7% in Rstrptime.h. +The analysis for libc showed that calls to wcsftime +dominated, so those calls were cached for R 3.0.3: the time spent in +no-vmlinux (the kernel) was reduced dramatically. +

+

On platforms which support it, call graphs can be produced by +opcontrol --callgraph if collected via operf +--callgraph. +

+

The profiling data is by default stored in sub-directory +oprofile_data of the current directory, which can be removed at +the end of the session. +

+

Another example, from sm version 2.2-5.4. The example for +sm.variogram took a long time: +

+
system.time(example(sm.variogram))
+...
+   user  system elapsed
+  5.543   3.202   8.785
+
+ +

including a lot of system time. Profiling just the slow part, the +second plot, showed +

+
+
  samples|      %|
+------------------
+   381845 99.9885 R
+	CPU_CLK_UNHALT...|
+	  samples|      %|
+	------------------
+	   187484 49.0995 sm.so
+	   169627 44.4230 no-vmlinux
+	    12636  3.3092 libgfortran.so.3.0.0
+	     6455  1.6905 R
+
+ +

so the system time was almost all in the Linux kernel. It is possible +to dig deeper if you have a matching uncompressed kernel with debug +symbols to specify via --vmlinux: we did not. +

+

In ‘legacy’ mode oprofile works by running a daemon which +collects information. The daemon must be started as root, e.g. +

+
+
% su
+% opcontrol --no-vmlinux
+% (optional, some platforms) opcontrol --callgraph=5
+% opcontrol --start
+% exit
+
+ +

Then as a user +

+
+
% R
+... run the boot example
+% opcontrol --dump
+% opreport -l /path/to/R_HOME/library/stats/libs/stats.so
+...
+samples  %        symbol name
+1623     75.5939  anonymous symbol from section .plt
+349      16.2552  numeric_deriv
+113       5.2632  nls_iter
+62        2.8878  getListElement
+% opreport -l /path/to/R_HOME/bin/exec/R
+...
+samples  %        symbol name
+76052    11.9912  Rf_eval
+54670     8.6198  Rf_findVarInFrame3
+37814     5.9622  Rf_allocVector
+31489     4.9649  Rf_duplicate
+28221     4.4496  Rf_protect
+26485     4.1759  Rf_cons
+23650     3.7289  Rf_matchArgs
+21088     3.3250  Rf_findFun
+19995     3.1526  findVarLocInFrame
+14871     2.3447  Rf_evalList
+13794     2.1749  R_Newhashpjw
+13522     2.1320  R_gc_internal
+...
+
+ +

Shutting down the profiler and clearing the records needs to be done as +root. +

+ +
+ +
+


+
+ +

3.4.2 Solaris

+ +

On 64-bit (only) Solaris, the standard profiling tool gprof +collects information from shared objects compiled with -pg. +

+
+ +
+


+
+ +

3.4.3 OS X

+ +

Developers have recommended sample (or Sampler.app, +which is a GUI version), Shark (in versions of Xcode +up to those for Snow Leopard), and Instruments (part of +Xcode, see +https://developer.apple.com/library/mac/#documentation/DeveloperTools/Conceptual/InstrumentsUserGuide/Introduction/Introduction.html). +

+ +
+ + + +

4 Debugging

+ +

This chapter covers the debugging of R extensions, starting with the +ways to get useful error information and moving on to how to deal with +errors that crash R. For those who prefer other styles there are +contributed packages such as debug on CRAN +(described in an article in +R-News +3/3). (There are notes from 2002 provided by Roger Peng at +http://www.biostat.jhsph.edu/~rpeng/docs/R-debug-tools.pdf +which provide complementary examples to those given here.) +

+ + + + + + + + +
+ +
+


+
+ +

4.1 Browsing

+ + +

Most of the R-level debugging facilities are based around the +built-in browser. This can be used directly by inserting a call to +browser() into the code of a function (for example, using +fix(my_function) ). When code execution reaches that point in +the function, control returns to the R console with a special prompt. +For example +

+
+
> fix(summary.data.frame) ## insert browser() call after for() loop
+> summary(women)
+Called from: summary.data.frame(women)
+Browse[1]> ls()
+ [1] "digits" "i"      "lbs"    "lw"     "maxsum" "nm"     "nr"     "nv"
+ [9] "object" "sms"    "z"
+Browse[1]> maxsum
+[1] 7
+Browse[1]>
+     height         weight
+ Min.   :58.0   Min.   :115.0
+ 1st Qu.:61.5   1st Qu.:124.5
+ Median :65.0   Median :135.0
+ Mean   :65.0   Mean   :136.7
+ 3rd Qu.:68.5   3rd Qu.:148.0
+ Max.   :72.0   Max.   :164.0
+> rm(summary.data.frame)
+
+ +

At the browser prompt one can enter any R expression, so for example +ls() lists the objects in the current frame, and entering the +name of an object will85 print it. The following commands are +also accepted +

+
    +
  • n + +

    Enter ‘step-through’ mode. In this mode, hitting return executes the +next line of code (more precisely one line and any continuation lines). +Typing c will continue to the end of the current context, e.g. +to the end of the current loop or function. +

    +
  • c + +

    In normal mode, this quits the browser and continues execution, and just +return works in the same way. cont is a synonym. +

    +
  • where + +

    This prints the call stack. For example +

    +
    +
    > summary(women)
    +Called from: summary.data.frame(women)
    +Browse[1]> where
    +where 1: summary.data.frame(women)
    +where 2: summary(women)
    +
    +Browse[1]>
    +
    + +
  • Q + +

    Quit both the browser and the current expression, and return to the +top-level prompt. +

+ +

Errors in code executed at the browser prompt will normally return +control to the browser prompt. Objects can be altered by assignment, +and will keep their changed values when the browser is exited. If +really necessary, objects can be assigned to the workspace from the +browser prompt (by using <<- if the name is not already in +scope). +

+
+ +
+


+
+ +

4.2 Debugging R code

+ + +

Suppose your R program gives an error message. The first thing to +find out is what R was doing at the time of the error, and the most +useful tool is traceback(). We suggest that this is run whenever +the cause of the error is not immediately obvious. Daily, errors are +reported to the R mailing lists as being in some package when +traceback() would show that the error was being reported by some +other package or base R. Here is an example from the regression +suite. +

+
+
> success <- c(13,12,11,14,14,11,13,11,12)
+> failure <- c(0,0,0,0,0,0,0,2,2)
+> resp <- cbind(success, failure)
+> predictor <- c(0, 5^(0:7))
+> glm(resp ~ 0+predictor, family = binomial(link="log"))
+Error: no valid set of coefficients has been found: please supply starting values
+> traceback()
+3: stop("no valid set of coefficients has been found: please supply
+	 starting values", call. = FALSE)
+2: glm.fit(x = X, y = Y, weights = weights, start = start, etastart = etastart,
+       mustart = mustart, offset = offset, family = family, control = control,
+       intercept = attr(mt, "intercept") > 0)
+1: glm(resp ~ 0 + predictor, family = binomial(link ="log"))
+
+ +

The calls to the active frames are given in reverse order (starting with +the innermost). So we see the error message comes from an explicit +check in glm.fit. (traceback() shows you all the lines of +the function calls, which can be limited by setting option +"deparse.max.lines".) +

+
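For example, before reproducing the error one might set:

options(deparse.max.lines = 2)  # traceback() then shows at most two lines per call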

Sometimes the traceback will indicate that the error was detected inside +compiled code, for example (from ?nls) +

+
+
Error in nls(y ~ a + b * x, start = list(a = 0.12345, b = 0.54321), trace = TRUE) :
+	step factor 0.000488281 reduced below ‘minFactor’ of 0.000976563
+>  traceback()
+2: .Call(R_nls_iter, m, ctrl, trace)
+1: nls(y ~ a + b * x, start = list(a = 0.12345, b = 0.54321), trace = TRUE)
+
+ +

This will be the case if the innermost call is to .C, +.Fortran, .Call, .External or .Internal, but +as it is also possible for such code to evaluate R expressions, this +need not be the innermost call, as in +

+
+
> traceback()
+9: gm(a, b, x)
+8: .Call(R_numeric_deriv, expr, theta, rho, dir)
+7: numericDeriv(form[[3]], names(ind), env)
+6: getRHS()
+5: assign("rhs", getRHS(), envir = thisEnv)
+4: assign("resid", .swts * (lhs - assign("rhs", getRHS(), envir = thisEnv)),
+       envir = thisEnv)
+3: function (newPars)
+   {
+       setPars(newPars)
+       assign("resid", .swts * (lhs - assign("rhs", getRHS(), envir = thisEnv)),
+	   envir = thisEnv)
+       assign("dev", sum(resid^2), envir = thisEnv)
+       assign("QR", qr(.swts * attr(rhs, "gradient")), envir = thisEnv)
+       return(QR$rank < min(dim(QR$qr)))
+   }(c(-0.00760232418963883, 1.00119632515036))
+2: .Call(R_nls_iter, m, ctrl, trace)
+1: nls(yeps ~ gm(a, b, x), start = list(a = 0.12345, b = 0.54321))
+
+ +

Occasionally traceback() does not help, and this can be the case +if S4 method dispatch is involved. Consider the following example +

+
+
> xyd <- new("xyloc", x=runif(20), y=runif(20))
+Error in as.environment(pkg) : no item called "package:S4nswv"
+on the search list
+Error in initialize(value, ...) : S language method selection got
+an error when called from internal dispatch for function ‘initialize’
+> traceback()
+2: initialize(value, ...)
+1: new("xyloc", x = runif(20), y = runif(20))
+
+ +

which does not help much, as there is no call to as.environment +in initialize (and the note “called from internal dispatch” +tells us so). In this case we searched the R sources for the quoted +call, which occurred in only one place, +methods:::.asEnvironmentPackage. So now we knew where the +error was occurring. (This was an unusually opaque example.) +

+

The error message +

+
+
evaluation nested too deeply: infinite recursion / options(expressions=)?
+
+ +

can be hard to handle with the default value (5000). Unless you know +that there actually is deep recursion going on, it can help to set +something like +

+
+
options(expressions=500)
+
+ +

and re-run the example showing the error. +

+

Sometimes there is warning that clearly is the precursor to some later +error, but it is not obvious where it is coming from. Setting +options(warn = 2) (which turns warnings into errors) can help here. +

+

Once we have located the error, we have some choices. One way to proceed +is to find out more about what was happening at the time of the crash by +looking a post-mortem dump. To do so, set + +options(error=dump.frames) and run the code again. Then invoke +debugger() and explore the dump. Continuing our example: +

+
+
> options(error = dump.frames)
+> glm(resp ~ 0 + predictor, family = binomial(link ="log"))
+Error: no valid set of coefficients has been found: please supply starting values
+
+ +

which is the same as before, but an object called last.dump has +appeared in the workspace. (Such objects can be large, so remove it +when it is no longer needed.) We can examine this at a later time by +calling the function debugger. + +

+
+
> debugger()
+Message:  Error: no valid set of coefficients has been found: please supply starting values
+Available environments had calls:
+1: glm(resp ~ 0 + predictor, family = binomial(link = "log"))
+2: glm.fit(x = X, y = Y, weights = weights, start = start, etastart = etastart, mus
+3: stop("no valid set of coefficients has been found: please supply starting values
+Enter an environment number, or 0 to exit  Selection:
+
+ +

which gives the same sequence of calls as traceback, but in +outer-first order and with only the first line of the call, truncated to +the current width. However, we can now examine in more detail what was +happening at the time of the error. Selecting an environment opens the +browser in that frame. So we select the function call which spawned the +error message, and explore some of the variables (and execute two +function calls). +

+
+
Enter an environment number, or 0 to exit  Selection: 2
+Browsing in the environment with call:
+   glm.fit(x = X, y = Y, weights = weights, start = start, etas
+Called from: debugger.look(ind)
+Browse[1]> ls()
+ [1] "aic"        "boundary"   "coefold"    "control"    "conv"
+ [6] "dev"        "dev.resids" "devold"     "EMPTY"      "eta"
+[11] "etastart"   "family"     "fit"        "good"       "intercept"
+[16] "iter"       "linkinv"    "mu"         "mu.eta"     "mu.eta.val"
+[21] "mustart"    "n"          "ngoodobs"   "nobs"       "nvars"
+[26] "offset"     "start"      "valideta"   "validmu"    "variance"
+[31] "varmu"      "w"          "weights"    "x"          "xnames"
+[36] "y"          "ynames"     "z"
+Browse[1]> eta
+	    1             2             3             4             5
+ 0.000000e+00 -2.235357e-06 -1.117679e-05 -5.588393e-05 -2.794197e-04
+	    6             7             8             9
+-1.397098e-03 -6.985492e-03 -3.492746e-02 -1.746373e-01
+Browse[1]> valideta(eta)
+[1] TRUE
+Browse[1]> mu
+	1         2         3         4         5         6         7         8
+1.0000000 0.9999978 0.9999888 0.9999441 0.9997206 0.9986039 0.9930389 0.9656755
+	9
+0.8397616
+Browse[1]> validmu(mu)
+[1] FALSE
+Browse[1]> c
+Available environments had calls:
+1: glm(resp ~ 0 + predictor, family = binomial(link = "log"))
+2: glm.fit(x = X, y = Y, weights = weights, start = start, etastart = etastart
+3: stop("no valid set of coefficients has been found: please supply starting v
+
+Enter an environment number, or 0 to exit  Selection: 0
+> rm(last.dump)
+
+ +

Because last.dump can be looked at later or even in another R +session, post-mortem debugging is possible even for batch usage of R. +We do need to arrange for the dump to be saved: this can be done either +using the command-line flag --save to save the workspace at the +end of the run, or via a setting such as +

+
+
> options(error = quote({dump.frames(to.file=TRUE); q()}))
+
+ +

See the help on dump.frames for further options and a worked +example. +

+ +
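As a small illustration (assuming the default dump name was used), a batch run that saved a dump in this way can be examined in a later interactive session:

$ R --vanilla
> load("last.dump.rda")     # file written by dump.frames(to.file = TRUE)
> debugger(last.dump)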

An alternative error action is to use the function recover(): +

+
+
> options(error = recover)
+> glm(resp ~ 0 + predictor, family = binomial(link = "log"))
+Error: no valid set of coefficients has been found: please supply starting values
+
+Enter a frame number, or 0 to exit
+
+1: glm(resp ~ 0 + predictor, family = binomial(link = "log"))
+2: glm.fit(x = X, y = Y, weights = weights, start = start, etastart = etastart
+
+Selection:
+
+ +

which is very similar to dump.frames. However, we can examine +the state of the program directly, without dumping and re-loading the +dump. As its help page says, recover can be routinely used as +the error action in place of dump.calls and dump.frames, +since it behaves like dump.frames in non-interactive use. +

+ + +

Post-mortem debugging is good for finding out exactly what went wrong, +but not necessarily why. An alternative approach is to take a closer +look at what was happening just before the error, and a good way to do +that is to use debug. This inserts a call to the browser +at the beginning of the function, starting in step-through mode. So in +our example we could use +

+
+
> debug(glm.fit)
+> glm(resp ~ 0 + predictor, family = binomial(link ="log"))
+debugging in: glm.fit(x = X, y = Y, weights = weights, start = start, etastart = etastart,
+    mustart = mustart, offset = offset, family = family, control = control,
+    intercept = attr(mt, "intercept") > 0)
+debug: {
+## lists the whole function
+Browse[1]>
+debug: x <- as.matrix(x)
+...
+Browse[1]> start
+[1] -2.235357e-06
+debug: eta <- drop(x %*% start)
+Browse[1]> eta
+	    1             2             3             4             5
+ 0.000000e+00 -2.235357e-06 -1.117679e-05 -5.588393e-05 -2.794197e-04
+	    6             7             8             9
+-1.397098e-03 -6.985492e-03 -3.492746e-02 -1.746373e-01
+Browse[1]>
+debug: mu <- linkinv(eta <- eta + offset)
+Browse[1]> mu
+	1         2         3         4         5         6         7         8
+1.0000000 0.9999978 0.9999888 0.9999441 0.9997206 0.9986039 0.9930389 0.9656755
+	9
+0.8397616
+
+ +

(The prompt Browse[1]> indicates that this is the first level of +browsing: it is possible to step into another function that is itself +being debugged or contains a call to browser().) +

+

debug can be used for hidden functions and S3 methods by +e.g. debug(stats:::predict.Arima). (It cannot be used for S4 +methods, but an alternative is given on the help page for debug.) +Sometimes you want to debug a function defined inside another function, +e.g. the function arimafn defined inside arima. To do so, +set debug on the outer function (here arima) and +step through it until the inner function has been defined. Then +call debug on the inner function (and use c to get out of +step-through mode in the outer function). +

+ +
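A minimal sketch of this two-step process, using made-up functions rather than arima:

f <- function(x) {
    g <- function(y) y^2    # 'g' only exists once f() is running
    g(x) + 1
}
> debug(f)
> f(2)
## step with 'n' until the line defining 'g' has been evaluated, then
Browse[1]> debug(g)
Browse[1]> c
## execution continues and the browser re-opens inside g()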

To remove debugging of a function, call undebug with the argument +previously given to debug; debugging otherwise lasts for the rest +of the R session (or until the function is edited or otherwise +replaced). +

+ +

trace can be used to temporarily insert debugging code into a +function, for example to insert a call to browser() just before +the point of the error. To return to our running example +

+
+
## first get a numbered listing of the expressions of the function
+> page(as.list(body(glm.fit)), method="print")
+> trace(glm.fit, browser, at=22)
+Tracing function "glm.fit" in package "stats"
+[1] "glm.fit"
+> glm(resp ~ 0 + predictor, family = binomial(link ="log"))
+Tracing glm.fit(x = X, y = Y, weights = weights, start = start,
+   etastart = etastart,  .... step 22
+Called from: eval(expr, envir, enclos)
+Browse[1]> n
+## and single-step from here.
+> untrace(glm.fit)
+
+

For your own functions, it may be as easy to use fix to insert +temporary code, but trace can help with functions in a namespace +(as can fixInNamespace). Alternatively, use +trace(,edit=TRUE) to insert code visually. +

+ +
+ + + +

4.3 Checking memory access

+ +

Errors in memory allocation and reading/writing outside arrays are very +common causes of crashes (e.g., segfaults) on some machines. Often +the crash appears long after the invalid memory access: in particular +damage to the structures which R itself has allocated may only become +apparent at the next garbage collection (or even at later garbage +collections after objects have been deleted). +

+

Note that memory access errors may be seen with LAPACK, BLAS, OpenMP and +Java-using packages: some at least of these seem to be intentional, and +some are related to passing characters to Fortran. +

+

Some of these tools can detect mismatched allocation and deallocation. +C++ programmers should note that memory allocated by new [] must +be freed by delete [], other uses of new by delete, +and memory allocated by malloc, calloc and realloc +by free. Some platforms will tolerate mismatches (perhaps with +memory leaks) but others will segfault. +

+ + + + + + + + + + +
+ + + +

4.3.1 Using gctorture

+ + +

We can help to detect memory problems in R objects earlier by running +garbage collection as often as possible. This is achieved by +gctorture(TRUE), which as described on its help page +

+
+

Provokes garbage collection on (nearly) every memory allocation. +Intended to ferret out memory protection bugs. Also makes R run +very slowly, unfortunately. +

+ +

The reference to ‘memory protection’ is to missing C-level calls to +PROTECT/UNPROTECT (see Garbage Collection) which if +missing allow R objects to be garbage-collected when they are still +in use. But it can also help with other memory-related errors. +

+

Normally running under gctorture(TRUE) will just produce a crash +earlier in the R program, hopefully close to the actual cause. See +the next section for how to decipher such crashes. +

+

It is possible to run all the examples, tests and vignettes covered by +R CMD check under gctorture(TRUE) by using the option +--use-gct. +

+

The function gctorture2 provides more refined control over the GC +torture process. Its arguments step, wait and +inhibit_release are documented on its help page. Environment +variables can also be used at the start of the R session to turn on +GC torture: R_GCTORTURE corresponds to the step argument to +gctorture2, R_GCTORTURE_WAIT to wait, and +R_GCTORTURE_INHIBIT_RELEASE to inhibit_release. +

+
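For example, to concentrate the (very slow) torture on a suspect region of code rather than on a whole session, something like the following can be used (the script name here is made up):

gctorture2(step = 10, wait = 1000)  # GC on every 10th allocation, after the first 1000
source("suspect-tests.R")           # run just the code under suspicion
gctorture(FALSE)                    # turn torture off again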

If R is configured with --enable-strict-barrier then a +variety of tests for the integrity of the write barrier are enabled. In +addition tests to help detect protect issues are enabled: +

+
    +
  • All GCs are full GCs. + +
  • New nodes in small node pages are marked as NEWSXP on creation. + +
  • After a GC all free nodes that are not of type NEWSXP are marked +as type FREESXP and their previous type is recorded. + +
  • Most calls to accessor functions check their SEXP inputs and +SEXP outputs and signal an error if a FREESXP is found. +The address of the node and the old type are included in the error +message. + +
+ +

R CMD check --use-gct can be set to use +gctorture2(n) rather than gctorture(TRUE) by setting +environment variable _R_CHECK_GCT_N_ to a positive integer value +to be used as n. +

+
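For example (the package tarball name is purely illustrative):

_R_CHECK_GCT_N_=100 R CMD check --use-gct mypkg_1.0.tar.gz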

Used with a debugger and with gctorture or gctorture2 this +mechanism can be helpful in isolating memory protect problems. +

+ +
+ + + +

4.3.2 Using valgrind

+ +

If you have access to Linux on a common CPU type or supported versions of OS X you can use valgrind (http://www.valgrind.org/, pronounced to rhyme with ‘tinned’) to check for possible problems. To run some examples under valgrind use something like

+
+
R -d valgrind --vanilla < mypkg-Ex.R
+R -d "valgrind --tool=memcheck --leak-check=full" --vanilla < mypkg-Ex.R
+
+ +

where mypkg-Ex.R is a set of examples, e.g. the file created in +mypkg.Rcheck by R CMD check. Occasionally this reports +memory reads of ‘uninitialised values’ that are the result of compiler +optimization, so can be worth checking under an unoptimized compile: for +maximal information use a build with debugging symbols. We know there +will be some small memory leaks from readline and R itself — +these are memory areas that are in use right up to the end of the R +session. Expect this to run around 20x slower than without +valgrind, and in some cases much slower than that. Several +versions of valgrind were not happy with some optimized BLASes +that use CPU-specific instructions so you may need to build a +version of R specifically to use with valgrind. +

+

On platforms where valgrind is installed you can build a version +of R with extra instrumentation to help valgrind detect errors +in the use of memory allocated from the R heap. The +configure option is +--with-valgrind-instrumentation=level, where level +is 0, 1 or 2. Level 0 is the default and does not add any anything. +Level 1 will detect some uses87 of uninitialised memory and has little impact on speed +(compared to level 0). Level 2 will detect many other memory-use +bugs88 but make R much slower when running under +valgrind. Using this in conjunction with gctorture can be +even more effective (and even slower). +

+

An example of valgrind output is +

+
==12539== Invalid read of size 4
+==12539==    at 0x1CDF6CBE: csc_compTr (Mutils.c:273)
+==12539==    by 0x1CE07E1E: tsc_transpose (dtCMatrix.c:25)
+==12539==    by 0x80A67A7: do_dotcall (dotcode.c:858)
+==12539==    by 0x80CACE2: Rf_eval (eval.c:400)
+==12539==    by 0x80CB5AF: R_execClosure (eval.c:658)
+==12539==    by 0x80CB98E: R_execMethod (eval.c:760)
+==12539==    by 0x1B93DEFA: R_standardGeneric (methods_list_dispatch.c:624)
+==12539==    by 0x810262E: do_standardGeneric (objects.c:1012)
+==12539==    by 0x80CAD23: Rf_eval (eval.c:403)
+==12539==    by 0x80CB2F0: Rf_applyClosure (eval.c:573)
+==12539==    by 0x80CADCC: Rf_eval (eval.c:414)
+==12539==    by 0x80CAA03: Rf_eval (eval.c:362)
+==12539==  Address 0x1C0D2EA8 is 280 bytes inside a block of size 1996 alloc'd
+==12539==    at 0x1B9008D1: malloc (vg_replace_malloc.c:149)
+==12539==    by 0x80F1B34: GetNewPage (memory.c:610)
+==12539==    by 0x80F7515: Rf_allocVector (memory.c:1915)
+...
+
+

This example is from an instrumented version of R, while tracking +down a bug in the Matrix package in 2006. The first line +indicates that R has tried to read 4 bytes from a memory address that +it does not have access to. This is followed by a C stack trace showing +where the error occurred. Next is a description of the memory that was +accessed. It is inside a block allocated by malloc, called from +GetNewPage, that is, in the internal R heap. Since this +memory all belongs to R, valgrind would not (and did not) +detect the problem in an uninstrumented build of R. In this example +the stack trace was enough to isolate and fix the bug, which was in +tsc_transpose, and in this example running under +gctorture() did not provide any additional information. When the +stack trace is not sufficiently informative the option +--db-attach=yes to valgrind may be helpful. This starts +a post-mortem debugger (by default gdb) so that variables in the +C code can be inspected (see Inspecting R objects). +

+

valgrind is good at spotting the use of uninitialized values: +use option --track-origins=yes to show where these originated +from. What it cannot detect is the misuse of arrays allocated on the +stack: this includes C automatic variables and some89 +Fortran arrays. +

+

It is possible to run all the examples, tests and vignettes covered by +R CMD check under valgrind by using the option +--use-valgrind. If you do this you will need to select the +valgrind options some other way, for example by having a +~/.valgrindrc file containing +

+
+
--leak-check=full
+--track-origins=yes
+
+ +

or setting the environment variable VALGRIND_OPTS. +

+

On OS X you may need to ensure that debugging symbols are made available +(so valgrind reports line numbers in files). This can usually +be done with the valgrind option --dsymutil=yes to +ask for the symbols to be dumped when the .so file is loaded. +This will not work where packages are installed into a system area (such +as the R.framework) and can be slow. Installing packages with +R CMD INSTALL --dsym installs the dumped symbols. (This can +also be done by setting environment variable PKG_MAKE_DSYM to a +non-empty value before the INSTALL.) +

+

This section has described the use of memcheck, the default (and most useful) of valgrind’s tools. There are others described in its documentation: helgrind can be useful for threaded programs.

+
+ + + +

4.3.3 Using the Address Sanitizer

+ +

AddressSanitizer (‘ASan’) is a tool with similar aims to the +memory checker in valgrind. It is available with suitable +builds90 of gcc and clang on common +Linux and OS X platforms. See +http://clang.llvm.org/docs/UsersManual.html#controlling-code-generation, +http://clang.llvm.org/docs/AddressSanitizer.html and +https://code.google.com/p/address-sanitizer/. +

+

More thorough checks of C++ code are done if the C++ library has been +‘annotated’: at the time of writing this applied to std::vector +in libc++ for use with clang and gives rise to +‘container-overflow’ reports. +

+

It requires code to have been compiled and linked with +-fsanitize=address and compiling with -fno-omit-frame-pointer +will give more legible reports. It has a runtime penalty of 2–3x, +extended compilation times and uses substantially more memory, often +1–2GB, at run time. On 64-bit platforms it reserves (but does not +allocate) 16–20TB of virtual memory: restrictive shell settings can +cause problems. +

+

By comparison with valgrind, ASan can +detect misuse of stack and global variables but not the use of +uninitialized memory. +

+

Recent versions return symbolic addresses for the location of the error +provided llvm-symbolizer91 is on the path: if it is available but not +on the path or has been renamed92, one can use an +environment variable, e.g. +

+
+
ASAN_SYMBOLIZER_PATH=/path/to/llvm-symbolizer
+
+ +

An alternative is to pipe the output through +asan_symbolize.py93 and perhaps +then (for compiled C++ code) c++filt. (On OS X, you may need +to run dsymutil to get line-number reports.) +

+

The simplest way to make use of this is to build a version of R with +something like +

+
+
CC="gcc -std=gnu99 -fsanitize=address"
+CFLAGS="-fno-omit-frame-pointer -g -O2 -Wall -pedantic -mtune=native"
+
+ +

which will ensure that the libasan run-time library is compiled +into the R executable. However this check can be enabled on a +per-package basis by using a ~/.R/Makevars file like +

+
CC = gcc-4.9 -std=gnu99 -fsanitize=address -fno-omit-frame-pointer
+CXX = g++-4.9 -fsanitize=address -fno-omit-frame-pointer
+F77 = gfortran-4.9 -fsanitize=address
+FC = gfortran-4.9 -fsanitize=address
+
+

(Note that -fsanitize=address has to be part of the compiler +specification to ensure it is used for linking. These settings will not +be honoured by packages which ignore ~/.R/Makevars.) It will +be necessary to build R with +

+
+
MAIN_LDFLAGS = -fsanitize=address
+
+ +

to link the runtime libraries into the R executable if it was not +specified as part of ‘CC’ when R was built. +

+

For options available via the environment variable +ASAN_OPTIONS see +https://code.google.com/p/address-sanitizer/wiki/Flags#Run-time_flags. +With gcc additional control is available via the +--params flag: see its man page. +

+

For more detailed information on an error, R can be run under a +debugger with a breakpoint set before the address sanitizer report is +produced: for gdb or lldb you could use +

+
break __asan_report_error
+
+

(See +https://code.google.com/p/address-sanitizer/wiki/AddressSanitizer#gdb.) +

+ + + + +
+ + + +

4.3.3.1 Using the Leak Sanitizer

+ +

For x86_64 Linux there is a leak sanitizer, ‘LSan’: see +https://code.google.com/p/address-sanitizer/wiki/LeakSanitizer. +This is available on recent versions of gcc and clang, and +where available is compiled in as part of ASan. +

+

One way to invoke this from an ASan-enabled build is by the environment +variable +

+
+
ASAN_OPTIONS='detect_leaks=1'
+
+

However, this was made the default for clang 3.5 and +gcc 5.1.0. +

+

When LSan is enabled, leaks give the process a failure error status (by +default 23). For an R package this means the R process, +and as the parser retains some memory to the end of the process, if R +itself was built against ASan, all runs will have a failure error status +(which may include running R as part of building R itself). +

+

To disable both this and some strict checking use +

+
+
setenv ASAN_OPTIONS 'alloc_dealloc_mismatch=0:detect_leaks=0:detect_odr_violation=0'
+
+ +

LSan also has a ‘stand-alone’ mode where it is compiled in using +-fsanitize=leak and avoids the run-time overhead of ASan. +

+
+ + + +

4.3.4 Using the Undefined Behaviour Sanitizer

+ +

‘Undefined behaviour’ is where the language standard does not require +particular behaviour from the compiler. Examples include division by +zero (where for doubles R requires the +ISO/IEC 60559 behaviour but C/C++ do not), use +of zero-length arrays, shifts too far for signed types (e.g. int +x, y; y = x << 31;), out-of-range coercion, invalid C++ casts and +mis-alignment. Not uncommon examples of out-of-range coercion in R +packages are attempts to coerce a NaN or infinity to type +int or NA_INTEGER to an unsigned type such as +size_t. Also common is y[x - 1] forgetting that x +might be NA_INTEGER. +

+

‘UBSanitizer’ is a tool for C/C++ source code selected by +-fsanitize=undefined in suitable builds of clang, and +GCC as from 4.9.0. Its (main) runtime library is linked into each +package’s DLL, so it is less often needed to be included in +MAIN_LDFLAGS. +

+
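By analogy with the Address Sanitizer example above, a per-package check might use a ~/.R/Makevars along these lines (a sketch only; compiler names and versions will vary by system):

CC = clang -fsanitize=undefined -fno-omit-frame-pointer
CXX = clang++ -fsanitize=undefined -fno-omit-frame-pointer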

Some versions have greatly increased compilation times on a few +files94. +

+

This sanitizer can be combined with the Address Sanitizer by +-fsanitize=undefined,address (where both are supported). +

+

Finer control of what is checked can be achieved by other options: for +clang see +http://clang.llvm.org/docs/UsersManual.html#controlling-code-generation.95 +The current set for clang is (on a single line): +

+
-fsanitize=alignment,bool,bounds,enum,float-cast-overflow,
+float-divide-by-zero,function,integer-divide-by-zero,non-null-attribute,
+null,object-size,return,returns-nonnull-attribute,shift,
+signed-integer-overflow,unreachable,vla-bound,vptr
+
+ +

a subset of which could be combined with address, or use something +like +

+
+
-fsanitize=undefined -fno-sanitize=float-divide-by-zero
+
+ +

(function, return and vptr apply only to C++). In +addition, +

+
-fsanitize=unsigned-integer-overflow
+
+

is available as a separate option in some versions of clang +(not enabled by -fsanitize=undefined). +

+

clang 3.5 and later may need +

+
+
-fsanitize=undefined -fno-sanitize=float-divide-by-zero,vptr
+
+ +

for C++ code (in CXX and CXX1X) as the run-time library +for vptr needs to be linked into the main R executable (and +that would need to be linked by clang++, not clang: you +could try building R with something like +

+
MAIN_LD="clang++ -fsanitize=undefined"
+R_OPENMP_CFLAGS="-fopenmp=libomp"
+
+

or add -lclang_rt.asan_cxx-x86_6496 or similar to LD_FLAGS). +

+

See https://gcc.gnu.org/onlinedocs/gcc/Debugging-Options.html (or +the manual for your version of GCC, installed or via +https://gcc.gnu.org/onlinedocs/) for the options supported by +GCC: 5.2.0 supports +

+
-fsanitize=alignment,bool,bounds,enum,float-cast-overflow,
+integer-divide-by-zero,non-null-attribute,null,object-size,
+return,returns-nonnull-attribute,shift,signed-integer-overflow,
+unreachable,vla-bound,vptr
+
+

with +

+
-fsanitize=float-divide-by-zero
+
+

as a separate option not enabled by -fsanitize=undefined (and not +desirable for R uses). At the time of writing the object-size +and vptr checks produced many warnings on GCC’s own C++ headers, +so should be disabled. +

+ +

Other useful flags include +

+
-fno-sanitize-recover
+
+ +

which causes the first report to be fatal (it always is for the +unreachable and return suboptions). For more detailed +information on where the runtime error occurs, R can be run under a +debugger with a breakpoint set before the sanitizer report is produced: +for gdb or lldb you could use +

+
break __ubsan_handle_float_cast_overflow
+break __ubsan_handle_float_cast_overflow_abort
+
+

or similar (there are handlers for each type of undefined behaviour). +

+

There are also the compiler flags -fcatch-undefined-behavior +and -ftrapv, said to be more reliable in clang than +gcc. +

+

For more details on the topic see +http://blog.regehr.org/archives/213 and +http://blog.llvm.org/2011/05/what-every-c-programmer-should-know.html +(which has 3 parts). +

+ +
+ + + +

4.3.5 Other analyses with ‘clang’

+ +

Recent versions of clang on ‘x86_64’ Linux have +‘ThreadSanitizer’ (https://code.google.com/p/thread-sanitizer/), +a ‘data race detector for C/C++ programs’, and ‘MemorySanitizer’ +(http://clang.llvm.org/docs/MemorySanitizer.html, +https://code.google.com/p/memory-sanitizer/wiki/MemorySanitizer) +for the detection of uninitialized memory. Both are based on and +provide similar functionality to tools in valgrind. +

+

clang has a ‘Static Analyser’ which can be run on the source +files during compilation: see http://clang-analyzer.llvm.org/. +

+
+ + + +

4.3.6 Using ‘Dr. Memory’

+ +

‘Dr. Memory’ from http://www.drmemory.org/ is a memory checker +for (currently) 32-bit Windows, Linux and OS X with similar aims to +valgrind. It works with unmodified executables97 +and detects memory access errors, uninitialized reads and memory leaks. +

+
+ + + +

4.3.7 Fortran array bounds checking

+ +

Most of the Fortran compilers used with R allow code to be compiled +with checking of array bounds: for example gfortran has option +-fbounds-check and Solaris Studio has -C. This will +give an error when the upper or lower bound is exceeded, e.g. +

+
At line 97 of file .../src/appl/dqrdc2.f
+Fortran runtime error: Index ‘1’ of dimension 1 of array ‘x’ above upper bound of 0
+
+ +

One does need to be aware that lazy programmers often specify Fortran +dimensions as 1 rather than * or a real bound and these +will be reported. +

+

It is easy to arrange to use this check on just the code in your +package: add to ~/.R/Makevars something like (for +gfortran) +

+
FCFLAGS = -g -O2 -mtune=native -fbounds-check
+FFLAGS = -g -O2 -mtune=native -fbounds-check
+
+ +

when you run R CMD check. +

+

This may incorrectly report errors with the way that Fortran character variables are passed, particularly when Fortran subroutines are called from C code. This may include the use of BLAS and LAPACK subroutines in R, so it is not advisable to build R itself with bounds checking (and it may not even be possible, as these subroutines are called during the R build).

+ +
+ +
+


+
+ +

4.4 Debugging compiled code

+ + + +

Sooner or later programmers will be faced with the need to debug +compiled code loaded into R. This section is geared to platforms +using gdb with code compiled by gcc, but similar things +are possible with other debuggers such as lldb +(http://lldb.llvm.org/, used on OS X) and Sun’s dbx: +some debuggers have graphical front-ends available. +

+

Consider first ‘crashes’, that is when R terminated unexpectedly with +an illegal memory access (a ‘segfault’ or ‘bus error’), illegal +instruction or similar. Unix-alike versions of R use a signal +handler which aims to give some basic information. For example +

+
+
 *** caught segfault ***
+address 0x20000028, cause ‘memory not mapped’
+
+Traceback:
+ 1: .identC(class1[[1]], class2)
+ 2: possibleExtends(class(sloti), classi, ClassDef2 = getClassDef(classi,
+where = where))
+ 3: validObject(t(cu))
+ 4: stopifnot(validObject(cu <- as(tu, "dtCMatrix")), validObject(t(cu)),
+validObject(t(tu)))
+
+Possible actions:
+1: abort (with core dump)
+2: normal R exit
+3: exit R without saving workspace
+4: exit R saving workspace
+Selection: 3
+
+ +

Since the R process may be damaged, the only really safe options are +the first or third. (Note that a core dump is only produced where +enabled: a common default in a shell is to limit its size to 0, thereby +disabling it.) +

+

A fairly common cause of such crashes is a package which uses .C +or .Fortran and writes beyond (at either end) one of the +arguments it is passed. As from R 3.0.0 there is a good way to +detect this: using options(CBoundsCheck = TRUE) (which can be +selected via the environment variable R_C_BOUNDS_CHECK=yes) +changes the way .C and .Fortran work to check if the +compiled code writes in the 64 bytes at either end of an argument. +

+
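As a sketch (the routine name is made up: suppose "overrun" writes n+1 doubles into its first argument):

> options(CBoundsCheck = TRUE)
> .C("overrun", x = double(10), n = as.integer(10))
## R should now report an error identifying the argument that the
## compiled code wrote beyond, rather than crashing later.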

Another cause of a ‘crash’ is to overrun the C stack. R tries to +track that in its own code, but it may happen in third-party compiled +code. For modern POSIX-compliant OSes R can safely catch that and +return to the top-level prompt, so one gets something like +

+
+
> .C("aaa")
+Error: segfault from C stack overflow
+>
+
+ +

However, C stack overflows are fatal under Windows and normally defeat +attempts at debugging on that platform. Further, the size of the stack +is set when R is compiled, whereas on POSIX OSes it can be set in the +shell from which R is launched. +

+

If you have a crash which gives a core dump you can use something like +

+
+
gdb /path/to/R/bin/exec/R core.12345
+
+ +

to examine the core dump. If core dumps are disabled or to catch errors +that do not generate a dump one can run R directly under a debugger +by for example +

+
+
$ R -d gdb --vanilla
+...
+gdb> run
+
+ +

at which point R will run normally, and hopefully the debugger will +catch the error and return to its prompt. This can also be used to +catch infinite loops or interrupt very long-running code. For a simple +example +

+
+
> for(i in 1:1e7) x <- rnorm(100)
+[hit Ctrl-C]
+Program received signal SIGINT, Interrupt.
+0x00397682 in _int_free () from /lib/tls/libc.so.6
+(gdb) where
+#0  0x00397682 in _int_free () from /lib/tls/libc.so.6
+#1  0x00397eba in free () from /lib/tls/libc.so.6
+#2  0xb7cf2551 in R_gc_internal (size_needed=313)
+    at /users/ripley/R/svn/R-devel/src/main/memory.c:743
+#3  0xb7cf3617 in Rf_allocVector (type=13, length=626)
+    at /users/ripley/R/svn/R-devel/src/main/memory.c:1906
+#4  0xb7c3f6d3 in PutRNGstate ()
+    at /users/ripley/R/svn/R-devel/src/main/RNG.c:351
+#5  0xb7d6c0a5 in do_random2 (call=0x94bf7d4, op=0x92580e8, args=0x9698f98,
+    rho=0x9698f28) at /users/ripley/R/svn/R-devel/src/main/random.c:183
+...
+
+ +

In many cases it is possible to attach a debugger to a running process: +this is helpful if an alternative front-end is in use or to investigate +a task that seems to be taking far too long. This is done by something +like +

+
+
gdb -p pid
+
+ +

where pid is the id of the R executable or front-end. +This stops the process so its state can be examined: use continue +to resume execution. +

+

Some “tricks” worth knowing follow: +

+ + + + + +
+ + + +

4.4.1 Finding entry points in dynamically loaded code

+ +

Under most compilation environments, compiled code dynamically loaded +into R cannot have breakpoints set within it until it is loaded. To +use a symbolic debugger on such dynamically loaded code under +Unix-alikes use +

+
    +
  • Call the debugger on the R executable, for example by R -d gdb. +
  • Start R. +
  • At the R prompt, use dyn.load or library to load your +shared object. +
  • Send an interrupt signal. This will put you back to the debugger +prompt. +
  • Set the breakpoints in your code. +
  • Continue execution of R by typing signal 0RET. +
+ +
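A sketch of such a session (the shared object and entry point names are made up):

$ R -d gdb
(gdb) run
> dyn.load("mypkg.so")
> ## now send an interrupt (e.g. Ctrl-C) to drop back to the debugger
(gdb) break my_entry_point
(gdb) signal 0
> ## back in R: calls into my_entry_point now stop at the breakpoint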

Under Windows signals may not be able to be used, and if so the procedure is +more complicated. See the rw-FAQ and +www.stats.uwo.ca/faculty/murdoch/software/debuggingR/gdb.shtml. +

+ +
+ + + +

4.4.2 Inspecting R objects when debugging

+ + +

The key to inspecting R objects from compiled code is the function +PrintValue(SEXP s) which uses the normal R printing +mechanisms to print the R object pointed to by s, or the safer +version R_PV(SEXP s) which will only print ‘objects’. +

+

One way to make use of PrintValue is to insert suitable calls +into the code to be debugged. +

+
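For example, a temporary diagnostic inserted into a .Call routine might look like the following (the routine itself is hypothetical, and the calls are removed again once the problem is found):

#include <R.h>
#include <Rinternals.h>

SEXP myfun(SEXP x)
{
    Rf_PrintValue(x);                               /* show the incoming object */
    Rf_PrintValue(Rf_getAttrib(x, R_NamesSymbol));  /* and one of its attributes */
    /* ... the real work ... */
    return x;
}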

Another way is to call R_PV from the symbolic debugger. +(PrintValue is hidden as Rf_PrintValue.) For example, +from gdb we can use +

+
+
(gdb) p R_PV(ab)
+
+ +

using the object ab from the convolution example, if we have +placed a suitable breakpoint in the convolution C code. +

+

To examine an arbitrary R object we need to work a little harder. +For example, let +

+
+
R> DF <- data.frame(a = 1:3, b = 4:6)
+
+ +

By setting a breakpoint at do_get and typing get("DF") at +the R prompt, one can find out the address in memory of DF, for +example +

+
+
Value returned is $1 = (SEXPREC *) 0x40583e1c
+(gdb) p *$1
+$2 = {
+  sxpinfo = {type = 19, obj = 1, named = 1, gp = 0,
+    mark = 0, debug = 0, trace = 0, = 0},
+  attrib = 0x40583e80,
+  u = {
+    vecsxp = {
+      length = 2,
+      type = {c = 0x40634700 "0>X@D>X@0>X@", i = 0x40634700,
+	f = 0x40634700, z = 0x40634700, s = 0x40634700},
+      truelength = 1075851272,
+    },
+    primsxp = {offset = 2},
+    symsxp = {pname = 0x2, value = 0x40634700, internal = 0x40203008},
+    listsxp = {carval = 0x2, cdrval = 0x40634700, tagval = 0x40203008},
+    envsxp = {frame = 0x2, enclos = 0x40634700},
+    closxp = {formals = 0x2, body = 0x40634700, env = 0x40203008},
+    promsxp = {value = 0x2, expr = 0x40634700, env = 0x40203008}
+  }
+}
+
+ +

(Debugger output reformatted for better legibility). +

+

Using R_PV() one can “inspect” the values of the various +elements of the SEXP, for example, +

+
+
(gdb) p R_PV($1->attrib)
+$names
+[1] "a" "b"
+
+$row.names
+[1] "1" "2" "3"
+
+$class
+[1] "data.frame"
+
+$3 = void
+
+ +

To find out where exactly the corresponding information is stored, one +needs to go “deeper”: +

+
+
(gdb) set $a = $1->attrib
+(gdb) p $a->u.listsxp.tagval->u.symsxp.pname->u.vecsxp.type.c
+$4 = 0x405d40e8 "names"
+(gdb) p $a->u.listsxp.carval->u.vecsxp.type.s[1]->u.vecsxp.type.c
+$5 = 0x40634378 "b"
+(gdb) p $1->u.vecsxp.type.s[0]->u.vecsxp.type.i[0]
+$6 = 1
+(gdb) p $1->u.vecsxp.type.s[1]->u.vecsxp.type.i[1]
+$7 = 5
+
+ +

Another alternative is the R_inspect function which shows the +low-level structure of the objects recursively (addresses differ from +the above as this example is created on another machine): +

+
+
(gdb) p R_inspect($1)
+@100954d18 19 VECSXP g0c2 [OBJ,NAM(2),ATT] (len=2, tl=0)
+  @100954d50 13 INTSXP g0c2 [NAM(2)] (len=3, tl=0) 1,2,3
+  @100954d88 13 INTSXP g0c2 [NAM(2)] (len=3, tl=0) 4,5,6
+ATTRIB:
+  @102a70140 02 LISTSXP g0c0 []
+    TAG: @10083c478 01 SYMSXP g0c0 [MARK,NAM(2),gp=0x4000] "names"
+    @100954dc0 16 STRSXP g0c2 [NAM(2)] (len=2, tl=0)
+      @10099df28 09 CHARSXP g0c1 [MARK,gp=0x21] "a"
+      @10095e518 09 CHARSXP g0c1 [MARK,gp=0x21] "b"
+    TAG: @100859e60 01 SYMSXP g0c0 [MARK,NAM(2),gp=0x4000] "row.names"
+    @102a6f868 13 INTSXP g0c1 [NAM(2)] (len=2, tl=1) -2147483648,-3
+    TAG: @10083c948 01 SYMSXP g0c0 [MARK,gp=0x4000] "class"
+    @102a6f838 16 STRSXP g0c1 [NAM(2)] (len=1, tl=1)
+      @1008c6d48 09 CHARSXP g0c2 [MARK,gp=0x21,ATT] "data.frame"
+
+ +

In general the representation of each object follows the format: +

+
+
@<address> <type-nr> <type-name> <gc-info> [<flags>] ...
+
+ +

For more fine-grained control over the depth of the recursion and the output of vectors, R_inspect3 takes two additional integer parameters: the maximum depth and the maximal number of elements that will be printed for scalar vectors. The defaults in R_inspect are currently -1 (no limit) and 5 respectively.

+ +
+ +
+


+
+ +

5 System and foreign language interfaces

+ + + + + + + + + + + + + + + + + + + +
+ + + +

5.1 Operating system access

+ + +

Access to operating system functions is via the R functions system and system2. The details will differ by platform (see the on-line help), and about all that can safely be assumed is that the first argument will be a string command that will be passed for execution (not necessarily by a shell) and that the second argument to system will be intern which, if true, will collect the output of the command into an R character vector.

+
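A minimal illustration (what the command does, and whether it exists at all, depends on the platform):

> out  <- system("ls -l", intern = TRUE)               # output as a character vector
> out2 <- system2("ls", args = "-l", stdout = TRUE)    # the system2 equivalent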

On POSIX-compliant OSes these commands pass a command-line to a shell: +Windows is not POSIX-compliant and there is a separate function +shell to do so. +

+

The function system.time + +is available for timing. Timing on child processes is only available on +Unix-alikes, and may not be reliable there. +

+
+ + + +

5.2 Interface functions .C and .Fortran

+ + + + + +

These two functions provide an interface to compiled code that has been +linked into R, either at build time or via dyn.load +(see dyn.load and dyn.unload). They are primarily intended for +compiled C and FORTRAN 77 code respectively, but the .C function +can be used with other languages which can generate C interfaces, for +example C++ (see Interfacing C++ code). +

+

The first argument to each function is a character string specifying the +symbol name as known98 to C or +FORTRAN, that is the function or subroutine name. (That the symbol is +loaded can be tested by, for example, is.loaded("cg"). Use the +name you pass to .C or .Fortran rather than the translated +symbol name.) +

+

There can be up to 65 further arguments giving R objects to be passed +to compiled code. Normally these are copied before being passed in, and +copied again to an R list object when the compiled code returns. If +the arguments are given names, these are used as names for the +components in the returned list object (but not passed to the compiled +code). +

+

The following table gives the mapping between the modes of R atomic +vectors and the types of arguments to a C function or FORTRAN +subroutine. +

+
+ + + + + + + + +
R storage modeC typeFORTRAN type
logicalint *INTEGER
integerint *INTEGER
doubledouble *DOUBLE PRECISION
complexRcomplex *DOUBLE COMPLEX
characterchar **CHARACTER*255
rawunsigned char *none
+
+ +

Do please note the first two. On the 64-bit Unix/Linux/OS X platforms, +long is 64-bit whereas int and INTEGER are 32-bit. +Code ported from S-PLUS (which uses long * for logical and +integer) will not work on all 64-bit platforms (although it may +appear to work on some, including Windows). Note also that if your +compiled code is a mixture of C functions and FORTRAN subprograms the +argument types must match as given in the table above. +

+

C type Rcomplex is a structure with double members r and i defined in the header file R_ext/Complex.h included by R.h. (On most platforms this is stored in a way compatible with the C99 double complex type: however, it may not be possible to pass Rcomplex to a C99 function expecting a double complex argument. Nor need it be compatible with a C++ complex type. Moreover, the compatibility can depend on the optimization level set for the compiler.)

+

Only a single character string can be passed to or from FORTRAN, and the +success of this is compiler-dependent. Other R objects can be passed +to .C, but it is much better to use one of the other interfaces. +

+

It is possible to pass numeric vectors of storage mode double to +C as float * or to FORTRAN as REAL by setting the +attribute Csingle, most conveniently by using the R functions +as.single, single or mode. This is intended only +to be used to aid interfacing existing C or FORTRAN code. +

+

Logical values are sent as 0 (FALSE), 1 +(TRUE) or INT_MIN = -2147483648 (NA, but only if +NAOK is true), and the compiled code should return one of these +three values. (Non-zero values other than INT_MIN are mapped to +TRUE.) +

+
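For instance, a .C routine intended to be called with NAOK = TRUE might handle the three possible values along these lines (a sketch, not taken from any package):

#include <R.h>   /* makes NA_LOGICAL (the INT_MIN sentinel) available */

void negate_logical(int *x, int *n)
{
    for (int i = 0; i < *n; i++)
        if (x[i] != NA_LOGICAL)          /* leave NAs untouched */
            x[i] = (x[i] == 0) ? 1 : 0;  /* flip TRUE/FALSE */
}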

Unless formal argument NAOK is true, all the other arguments are +checked for missing values NA and for the IEEE special +values NaN, Inf and -Inf, and the presence of any +of these generates an error. If it is true, these values are passed +unchecked. +

+

Argument PACKAGE confines the search for the symbol name to a +specific shared object (or use "base" for code compiled into +R). Its use is highly desirable, as there is no way to avoid two +package writers using the same symbol name, and such name clashes are +normally sufficient to cause R to crash. (If it is not present and +the call is from the body of a function defined in a package namespace, +the shared object loaded by the first (if any) useDynLib +directive will be used. However, prior to R 2.15.2 the detection of +the correct namespace is unreliable and you are strongly recommended to +use the PACKAGE argument for packages to be used with earlier +versions of R. +

+

Note that the compiled code should not return anything except through +its arguments: C functions should be of type void and FORTRAN +subprograms should be subroutines. +

+

To fix ideas, let us consider a very simple example which convolves two +finite sequences. (This is hard to do fast in interpreted R code, but +easy in C code.) We could do this using .C by +

+
+
void convolve(double *a, int *na, double *b, int *nb, double *ab)
+{
+    int nab = *na + *nb - 1;
+
+    for(int i = 0; i < nab; i++)
+	ab[i] = 0.0;
+    for(int i = 0; i < *na; i++)
+	for(int j = 0; j < *nb; j++)
+	    ab[i + j] += a[i] * b[j];
+}
+
+ +

called from R by +

+
+
conv <- function(a, b)
+    .C("convolve",
+       as.double(a),
+       as.integer(length(a)),
+       as.double(b),
+       as.integer(length(b)),
+       ab = double(length(a) + length(b) - 1))$ab
+
+ +

Note that we take care to coerce all the arguments to the correct R +storage mode before calling .C; mistakes in matching the types +can lead to wrong results or hard-to-catch errors. +

+
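A brief usage sketch, assuming the C code above is in a file called convolve.c:

R CMD SHLIB convolve.c        # in a shell: builds convolve.so (or .dll)

> dyn.load(paste0("convolve", .Platform$dynlib.ext))
> conv(1:3, c(1, 0.5))
[1] 1.0 2.5 4.0 1.5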

Special care is needed in handling character vector arguments in +C (or C++). On entry the contents of the elements are duplicated and +assigned to the elements of a char ** array, and on exit the +elements of the C array are copied to create new elements of a character +vector. This means that the contents of the character strings of the +char ** array can be changed, including to \0 to shorten +the string, but the strings cannot be lengthened. It is +possible99 to allocate a new string via +R_alloc and replace an entry in the char ** array by the +new string. However, when character vectors are used other than in a +read-only way, the .Call interface is much to be preferred. +

+

Passing character strings to FORTRAN code needs even more care, and +should be avoided where possible. Only the first element of the +character vector is passed in, as a fixed-length (255) character array. +Up to 255 characters are passed back to a length-one character vector. +How well this works (or even if it works at all) depends on the C and +FORTRAN compilers on each platform (including on their options). Often +what is being passed to FORTRAN is one of a small set of possible values +(a factor in R terms) which could alternatively be passed as an +integer code: similarly FORTRAN code that wants to generate diagnostic +messages can pass an integer code to a C or R wrapper which will +convert it to a character string. +

+

It is possible to pass some R objects other than atomic vectors via +.C, but this is only supported for historical compatibility: use +the .Call or .External interfaces for such objects. Any +C/C++ code that includes Rinternals.h should be called via +.Call or .External. +

+
+ + + +

5.3 dyn.load and dyn.unload

+ + + + + +

Compiled code to be used with R is loaded as a shared object +(Unix-alikes including OS X, see Creating shared objects for more +information) or DLL (Windows). +

+

The shared object/DLL is loaded by dyn.load and unloaded by +dyn.unload. Unloading is not normally necessary, but it is +needed to allow the DLL to be re-built on some platforms, including +Windows. +

+

The first argument to both functions is a character string giving the +path to the object. Programmers should not assume a specific file +extension for the object/DLL (such as .so) but use a construction +like +

+
+
file.path(path1, path2, paste0("mylib", .Platform$dynlib.ext))
+
+ +

for platform independence. On Unix-alike systems the path supplied to +dyn.load can be an absolute path, one relative to the current +directory or, if it starts with ‘~’, relative to the user’s home +directory. +

+

Loading is most often done automatically based on the useDynLib() +declaration in the NAMESPACE file, but may be done +explicitly via a call to library.dynam. + +This has the form +

+
+
library.dynam("libname", package, lib.loc)
+
+ +

where libname is the object/DLL name with the extension +omitted. Note that the first argument, chname, should +not be package since this will not work if the package +is installed under another name. +

+

Under some Unix-alike systems there is a choice of how the symbols are +resolved when the object is loaded, governed by the arguments +local and now. Only use these if really necessary: in +particular using now=FALSE and then calling an unresolved symbol +will terminate R unceremoniously. +

+

R provides a way of executing some code automatically when a object/DLL +is either loaded or unloaded. This can be used, for example, to +register native routines with R’s dynamic symbol mechanism, initialize +some data in the native code, or initialize a third party library. On +loading a DLL, R will look for a routine within that DLL named +R_init_lib where lib is the name of the DLL file with +the extension removed. For example, in the command +

+
+
library.dynam("mylib", package, lib.loc)
+
+ +

R looks for the symbol named R_init_mylib. Similarly, when +unloading the object, R looks for a routine named +R_unload_lib, e.g., R_unload_mylib. In either case, +if the routine is present, R will invoke it and pass it a single +argument describing the DLL. This is a value of type DllInfo +which is defined in the Rdynload.h file in the R_ext +directory. +

+

Note that there are some implicit restrictions on this mechanism as the +basename of the DLL needs to be both a valid file name and valid as part +of a C entry point (e.g. it cannot contain ‘.’): for portable +code it is best to confine DLL names to be ASCII alphanumeric +plus underscore. If entry point R_init_lib is not found it +is also looked for with ‘.’ replaced by ‘_’. +

+ +

The following example shows templates for the initialization and +unload routines for the mylib DLL. +

+
+
+
+
#include <R.h>
+#include <Rinternals.h>
+#include <R_ext/Rdynload.h>
+
+void
+R_init_mylib(DllInfo *info)
+{
+  /* Register routines,
+     allocate resources. */
+}
+
+void
+R_unload_mylib(DllInfo *info)
+{
+  /* Release resources. */
+}
+
+
+
+ +

If a shared object/DLL is loaded more than once the most recent version +is used. More generally, if the same symbol name appears in several +shared objects, the most recently loaded occurrence is used. The +PACKAGE argument and registration (see the next section) provide +good ways to avoid any ambiguity in which occurrence is meant. +

+

On Unix-alikes the paths used to resolve dynamically linked dependent +libraries are fixed (for security reasons) when the process is launched, +so dyn.load will only look for such libraries in the locations +set by the R shell script (via etc/ldpaths) and in +the OS-specific defaults. +

+

Windows allows more control (and less security) over where dependent +DLLs are looked for. On all versions this includes the PATH +environment variable, but with lowest priority: note that it does not +include the directory from which the DLL was loaded. It is possible to +add a single path with quite high priority via the DLLpath +argument to dyn.load. This is (by default) used by +library.dynam to include the package’s libs/i386 or +libs/x64 directory in the DLL search path. +

+ +
+ + + +

5.4 Registering native routines

+ + +

By ‘native’ routine, we mean an entry point in compiled code. +

+

In calls to .C, .Call, .Fortran and +.External, R must locate the specified native routine by +looking in the appropriate shared object/DLL. By default, R uses the +operating system-specific dynamic loader to lookup the symbol in all +loaded DLLs and elsewhere. Alternatively, the author of the DLL +can explicitly register routines with R and use a single, +platform-independent mechanism for finding the routines in the DLL. One +can use this registration mechanism to provide additional information +about a routine, including the number and type of the arguments, and +also make it available to R programmers under a different name. In +the future, registration may be used to implement a form of “secure” +or limited native access. +

+ +

To register routines with R, one calls the C routine R_registerRoutines. This is typically done when the DLL is first loaded, within the initialization routine R_init_dll name described in dyn.load and dyn.unload. R_registerRoutines takes 5 arguments. The first is the DllInfo object passed by R to the initialization routine. This is where R stores the information about the methods. The remaining 4 arguments are arrays describing the routines for each of the 4 different interfaces: .C, .Call, .Fortran and .External. Each argument is a NULL-terminated array of the element types given in the following table:

+
+ + + + + +
.CR_CMethodDef
.CallR_CallMethodDef
.FortranR_FortranMethodDef
.ExternalR_ExternalMethodDef
+
+ +

Currently, the R_ExternalMethodDef is the same as +R_CallMethodDef type and contains fields for the name of the +routine by which it can be accessed in R, a pointer to the actual +native symbol (i.e., the routine itself), and the number of arguments +the routine expects to be passed from R. For example, if we had a +routine named myCall defined as +

+
+
SEXP myCall(SEXP a, SEXP b, SEXP c);
+
+ +

we would describe this as +

+
+
static R_CallMethodDef callMethods[]  = {
+  {"myCall", (DL_FUNC) &myCall, 3},
+  {NULL, NULL, 0}
+};
+
+ +

along with any other routines for the .Call interface. For +routines with a variable number of arguments invoked via the +.External interface, one specifies -1 for the number of +arguments which tells R not to check the actual number passed. Note +that the number of arguments passed to .External were not +checked prior to R 3.0.0. +

+

Routines for use with the .C and .Fortran interfaces are +described with similar data structures, but which have two additional +fields for describing the type and “style” of each argument. Each of +these can be omitted. However, if specified, each should be an array +with the same number of elements as the number of parameters for the +routine. The types array should contain the SEXP types +describing the expected type of the argument. (Technically, the elements +of the types array are of type R_NativePrimitiveArgType which is +just an unsigned integer.) The R types and corresponding type +identifiers are provided in the following table: +

+
+ + + + + + + +
numericREALSXP
integerINTSXP
logicalLGLSXP
singleSINGLESXP
characterSTRSXP
listVECSXP
+
+ +

Consider a C routine, myC, declared as +

+
+
void myC(double *x, int *n, char **names, int *status);
+
+ +

We would register it as +

+
+
static R_NativePrimitiveArgType myC_t[] = {
+    REALSXP, INTSXP, STRSXP, LGLSXP
+};
+
+static R_CMethodDef cMethods[] = {
+   {"myC", (DL_FUNC) &myC, 4, myC_t},
+   {NULL, NULL, 0, NULL}
+};
+
+ +

One can also specify whether each argument is used simply as input, or +as output, or as both input and output. The style field in the +description of a method is used for this. The purpose is to +allow100 R to transfer values +more efficiently across the R-C/FORTRAN interface by avoiding copying +values when it is not necessary. Typically, one omits this information +in the registration data. +

+

Having created the arrays describing each routine, the last step is to +actually register them with R. We do this by calling +R_registerRoutines. For example, if we have the descriptions +above for the routines accessed by the .C and .Call +we would use the following code: +

+
+
void
+R_init_myLib(DllInfo *info)
+{
+   R_registerRoutines(info, cMethods, callMethods, NULL, NULL);
+}
+
+ +

This routine will be invoked when R loads the shared object/DLL named +myLib. The last two arguments in the call to +R_registerRoutines are for the routines accessed by +.Fortran and .External interfaces. In our example, these +are given as NULL since we have no routines of these types. +

+

When R unloads a shared object/DLL, its registrations are +automatically removed. There is no other facility for unregistering a +symbol. +

+

Examples of registering routines can be found in the different packages +in the R source tree (e.g., stats). Also, there is a +brief, high-level introduction in R News (volume 1/3, September +2001, pages 20–23, https://www.r-project.org/doc/Rnews/Rnews_2001-3.pdf). +

+

Once routines are registered, they can be referred to as R objects if this is arranged in the useDynLib call in the package’s NAMESPACE file (see useDynLib). This avoids the overhead of looking up an entry point each time it is used, and ensures that the entry point in the package is the one used (without a PACKAGE = "pkg" argument). So for example the stats package has

+
# Refer to all C/Fortran routines by their name prefixed by C_
+useDynLib(stats, .registration = TRUE, .fixes = "C_")
+
+

in its NAMESPACE file, and then ansari.test’s default +methods can contain +

+
	pansari <- function(q, m, n)
+	    .C(C_pansari, as.integer(length(q)), p = as.double(q),
+		as.integer(m), as.integer(n))$p
+
+ + + + + + + + +
+ + + +

5.4.1 Speed considerations

+ +

Sometimes registering native routines or using a PACKAGE argument +can make a large difference. The results can depend quite markedly on +the OS (and even if it is 32- or 64-bit), on the version of R and +what else is loaded into R at the time. +

+

To fix ideas, first consider x86_64 OS X 10.7 and R 2.15.2. A simple .Call function might be

+
foo <- function(x) .Call("foo", x)
+
+

with C code +

+
SEXP foo(SEXP x)
+{
+    return x;
+}
+
+

If we compile this with R CMD SHLIB foo.c, load the code by dyn.load("foo.so") and run foo(pi), it took around 22 microseconds (us). Specifying the DLL by

+
foo2 <- function(x) .Call("foo", x, PACKAGE = "foo")
+
+

reduced the time to 1.7 us. +

+

Now consider making these functions part of a package whose NAMESPACE file uses useDynLib(foo). This immediately reduces the running time as "foo" will be preferentially looked for in foo.dll. Without specifying PACKAGE it took about 5 us (it needs to fathom out the appropriate DLL each time it is invoked but it does not need to search all DLLs), and with the PACKAGE argument it is again about 1.7 us.

+

Next suppose the package has registered the native routine foo. +Then foo() still has to find the appropriate DLL but can get to +the entry point in the DLL faster, in about 4.2 us. And foo2() +now takes about 1 us. If we register the symbols in the +NAMESPACE file and use +

+
foo3 <- function(x) .Call(C_foo, x)
+
+

then the address for the native routine is looked up just once when the +package is loaded, and foo3(pi) takes about 0.8 us. +

+

Versions using .C() rather than .Call() take about 0.2 us +longer. +

+

These are all quite small differences, but C routines are not uncommonly +invoked millions of times for run times of a few microseconds, and those +doing such things may wish to be aware of the differences. +

+

On Linux and Solaris there is a much smaller overhead in looking up +symbols so foo(pi) takes around 5 times as long as +foo3(pi). +

+

Symbol lookup on Windows used to be far slower, so R maintains a +small cache. If the cache is currently empty enough that the symbol can +be stored in the cache then the performance is similar to Linux and +Solaris: if not it may be slower. R’s own code always uses +registered symbols and so these never contribute to the cache: however +many other packages do rely on symbol lookup. +

+ + +
+ + + +

5.4.2 Linking to native routines in other packages

+ +

In addition to registering C routines to be called by R, it can at +times be useful for one package to make some of its C routines available +to be called by C code in another package. The interface consists of +two routines declared in header R_ext/Rdynload.h as +

+ + +
+
void R_RegisterCCallable(const char *package, const char *name,
+			 DL_FUNC fptr);
+DL_FUNC R_GetCCallable(const char *package, const char *name);
+
+ +

A package packA that wants to make a C routine myCfun +available to C code in other packages would include the call +

+
+
R_RegisterCCallable("packA", "myCfun", myCfun);
+
+ +

in its initialization function R_init_packA. A package +packB that wants to use this routine would retrieve the function +pointer with a call of the form +

+
+
p_myCfun = R_GetCCallable("packA", "myCfun");
+
+ +

The author of packB is responsible for ensuring that +p_myCfun has an appropriate declaration. In the future R may +provide some automated tools to simplify exporting larger numbers of +routines. +

+

A package that wishes to make use of header files in other packages +needs to declare them as a comma-separated list in the field +‘LinkingTo’ in the DESCRIPTION file. This then arranges +that the include directories in the installed linked-to packages +are added to the include paths for C and C++ code. +

+

It must specify101 +‘Imports’ or ‘Depends’ of those packages, for they have to be +loaded102 prior to this one +(so the path to their compiled code has been registered). +

+ +

A CRAN example of the use of this mechanism is package +lme4, which links to Matrix. +

+
+ + + +

5.5 Creating shared objects

+ + + +

Shared objects for loading into R can be created using R CMD +SHLIB. This accepts as arguments a list of files which must be object +files (with extension .o) or sources for C, C++, FORTRAN 77, +Fortran 9x, Objective C or Objective C++ (with extensions .c, +.cc or .cpp, .f, .f90 or .f95, +.m, and .mm or .M, respectively), or commands to be +passed to the linker. See R CMD SHLIB --help (or the R help +for SHLIB) for usage information. +

+

If compiling the source files does not work “out of the box”, you can +specify additional flags by setting some of the variables + +PKG_CPPFLAGS (for the C preprocessor, typically ‘-I’ flags), + + + + + + +PKG_CFLAGS, PKG_CXXFLAGS, PKG_FFLAGS, +PKG_FCFLAGS, PKG_OBJCFLAGS, and PKG_OBJCXXFLAGS +(for the C, C++, FORTRAN 77, Fortran 9x, Objective C, and Objective C++ +compilers, respectively) in the file Makevars in the compilation +directory (or, of course, create the object files directly from the +command line). + +Similarly, variable PKG_LIBS in Makevars can be used for +additional ‘-l’ and ‘-L’ flags to be passed to the linker when +building the shared object. (Supplying linker commands as arguments to +R CMD SHLIB will take precedence over PKG_LIBS in +Makevars.) +

+ +
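For instance, a src/Makevars might contain no more than the following (the include path and the library linked against are purely illustrative):

PKG_CPPFLAGS = -I../inst/include
PKG_LIBS = -L/usr/local/lib -lfftw3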

It is possible to arrange to include compiled code from other languages +by setting the macro ‘OBJECTS’ in file Makevars, together +with suitable rules to make the objects. +

+

Flags which are already set (for example in file +etcR_ARCH/Makeconf) can be overridden by the environment +variable MAKEFLAGS (at least for systems using a POSIX-compliant +make), as in (Bourne shell syntax) +

+
+
MAKEFLAGS="CFLAGS=-O3" R CMD SHLIB *.c
+
+ +

It is also possible to set such variables in personal Makevars +files, which are read after the local Makevars and the system +makefiles or in a site-wide Makevars.site file. +See Customizing package compilation in R Installation and Administration, +

+ +

Note that as R CMD SHLIB uses Make, it will not remake a shared +object just because the flags have changed, and if test.c and +test.f both exist in the current directory +

+
+
R CMD SHLIB test.f
+
+ +

will compile test.c! +

+ +

If the src subdirectory of an add-on package contains source code +with one of the extensions listed above or a file Makevars but +not a file Makefile, R CMD INSTALL creates a +shared object (for loading into R through useDynlib in the +NAMESPACE, or in the .onLoad function of the package) +using the R CMD SHLIB mechanism. If file Makevars +exists it is read first, then the system makefile and then any personal +Makevars files. +

+

If the src subdirectory of package contains a file +Makefile, this is used by R CMD INSTALL in place of the +R CMD SHLIB mechanism. make is called with makefiles +R_HOME/etcR_ARCH/Makeconf, src/Makefile and +any personal Makevars files (in that order). The first target +found in src/Makefile is used. +

+

It is better to make use of a Makevars file rather than a +Makefile: the latter should be needed only exceptionally. +

+ +

Under Windows the same commands work, but Makevars.win will be +used in preference to Makevars, and only src/Makefile.win +will be used by R CMD INSTALL with src/Makefile being +ignored. For past experiences of building DLLs with a variety of +compilers, see file ‘README.packages’ and +http://www.stats.uwo.ca/faculty/murdoch/software/compilingDLLs/ +. Under Windows you can supply an exports definitions file called +dllname-win.def: otherwise all entry points in objects (but +not libraries) supplied to R CMD SHLIB will be exported from the +DLL. An example is stats-win.def for the stats package: a +CRAN example in package fastICA. +

+

If you feel tempted to read the source code and subvert these +mechanisms, please resist. Far too much developer time has been wasted +in chasing down errors caused by failures to follow this documentation, +and even more by package authors demanding explanations as to why their +packages no longer work. +In particular, undocumented environment or make variables are +not for use by package writers and are subject to change without notice. +

+
+ + + +

5.6 Interfacing C++ code

+ + + +

Suppose we have the following hypothetical C++ library, consisting of +the two files X.h and X.cpp, and implementing the two +classes X and Y which we want to use in R. +

+
+
+
+
// X.h
+
+class X {
+public: X (); ~X ();
+};
+
+class Y {
+public: Y (); ~Y ();
+};
+
+
+
+ +
+
+
+
// X.cpp
+
+#include <R.h>
+#include "X.h"
+
+static Y y;
+
+X::X()  { REprintf("constructor X\n"); }
+X::~X() { REprintf("destructor X\n");  }
+Y::Y()  { REprintf("constructor Y\n"); }
+Y::~Y() { REprintf("destructor Y\n");  }
+
+
+
+ +

To use with R, the only thing we have to do is write a wrapper +function and ensure that the function is enclosed in +

+
+
extern "C" {
+
+}
+
+ +

For example, +

+
+
+
+
// X_main.cpp:
+
+#include "X.h"
+
+extern "C" {
+
+void X_main () {
+  X x;
+}
+
+} // extern "C"
+
+
+
+ +

Compiling and linking should be done with the C++ compiler-linker +(rather than the C compiler-linker or the linker itself); otherwise, the +C++ initialization code (and hence the constructor of the static +variable Y) is not called. On a properly configured system, one +can simply use +

+
+
R CMD SHLIB X.cpp X_main.cpp
+
+ +

to create the shared object, typically X.so (the file name +extension may be different on your platform). Now starting R yields +

+
+
R version 2.14.1 Patched (2012-01-16 r58124)
+Copyright (C) 2012 The R Foundation for Statistical Computing
+...
+Type    "q()" to quit R.
+
+
R> dyn.load(paste("X", .Platform$dynlib.ext, sep = ""))
+constructor Y
+R> .C("X_main")
+constructor X
+destructor X
+list()
+R> q()
+Save workspace image? [y/n/c]: y
+destructor Y
+
+ +

The R for Windows FAQ (rw-FAQ) contains details of how +to compile this example under Windows. +

+

Earlier versions of this example used C++ iostreams: this is best +avoided. There is no guarantee that the output will appear in the R +console, and indeed it will not on the R for Windows console. Use +R code or the C entry points (see Printing) for all I/O if at all +possible. Examples have been seen where merely loading a DLL that +contained calls to C++ I/O upset R’s own C I/O (for example by +resetting buffers on open files). +

+

Most R header files can be included within C++ programs, and they +should not be included within an extern "C" block (as +they include C++ system headers). It may not be possible to include +some R headers as they in turn include C header files that may cause +conflicts—if this happens, define ‘NO_C_HEADERS’ before including +the R headers, and include C++ versions (such as ‘cmath’) of the +appropriate headers yourself before the R headers. +

+
+ + + +

5.7 Fortran I/O

+ +

We have already warned against the use of C++ iostreams not least +because output is not guaranteed to appear on the R console, and this +warning applies equally to Fortran (77 or 9x) output to units * +and 6. See Printing from FORTRAN, which describes workarounds. +

+

In the past most Fortran compilers implemented I/O on top of the C I/O +system and so the two interworked successfully. This was true of +g77, but it is less true of gfortran as used in +gcc 4.y.z. In particular, any package that makes use of Fortran +I/O will when compiled on Windows interfere with C I/O: when the Fortran +I/O is initialized (typically when the package is loaded) the C +stdout and stderr are switched to LF line endings. +(Function init in file src/modules/lapack/init_win.c shows how to +mitigate this.) +

+
+ + + +

5.8 Linking to other packages

+ +

It is not in general possible to link a DLL in package packA to a +DLL provided by package packB (for the security reasons mentioned +in dyn.load and dyn.unload, and also because some platforms +distinguish between shared objects and dynamic libraries), but it is on +Windows. +

+

Note that there can be tricky versioning issues here, as package +packB could be re-installed after package packA — it is +desirable that the API provided by package packB remains +backwards-compatible. +

+

Shipping a static library in package packB for other packages to +link to avoids most of the difficulties. +

+ + + + + +
+ + + +

5.8.1 Unix-alikes

+ +

It is possible to link a shared object in package packA to a +library provided by package packB under limited circumstances +on a Unix-alike OS. There are severe portability issues, so this is not +recommended for a distributed package. +

+

This is easiest if packB provides a static library +packB/lib/libpackB.a. (Note using directory lib rather +than libs is conventional, and architecture-specific +sub-directories may be needed and are assumed in the sample code +below. The code in the static library will need to be compiled with +PIC flags on platforms where it matters.) Then as the code from +package packB is incorporated when package packA is +installed, we only need to find the static library at install time for +package packA. The only issue is to find package packB, and +for that we can ask R by something like (long lines broken for +display here) +

+
+
PKGB_PATH=`echo 'library(packB);
+  cat(system.file("lib",  package="packB", mustWork=TRUE))' \
+ | "${R_HOME}/bin/R" --vanilla --slave`
+PKG_LIBS="$(PKGB_PATH)$(R_ARCH)/libpackB.a"
+
+ +

For a dynamic library packB/lib/libpackB.so +(packB/lib/libpackB.dylib on OS X: note that you cannot link to +a shared object, .so, on that platform) we could use +

+
+
PKGB_PATH=`echo 'library(packB);
+  cat(system.file("lib", package="packB", mustWork=TRUE))' \
+ | "${R_HOME}/bin/R" --vanilla --slave`
+PKG_LIBS=-L"$(PKGB_PATH)$(R_ARCH)" -lpackB
+
+ +

This will work for installation, but very likely not when package +packB is loaded, as the path to package packB’s lib +directory is not in the ld.so103 search path. You can arrange to +put it there before R is launched by setting (on some +platforms) LD_RUN_PATH or LD_LIBRARY_PATH or adding to the +ld.so cache (see man ldconfig). On platforms that +support it, the path to the directory containing the dynamic library can +be hardcoded at install time (which assumes that the location of package +packB will not be changed nor the package updated to a changed +API). On systems with the gcc or clang and the +GNU linker (e.g. Linux) and some others this can be done by +e.g. +

+
+
PKGB_PATH=`echo 'library(packB);
+  cat(system.file("lib", package="packB", mustWork=TRUE))' \
+ | "${R_HOME}/bin/R" --vanilla --slave`
+PKG_LIBS=-L"$(PKGB_PATH)$(R_ARCH)" -Wl,-rpath,"$(PKGB_PATH)$(R_ARCH)" -lpackB
+
+ +

Some other systems (e.g. Solaris with its native linker) use +-Rdir rather than -rpath,dir (and this is accepted by +the compiler as well as the linker). +

+

It may be possible to figure out what is required semi-automatically +from the result of R CMD libtool --config (look for +‘hardcode’). +

+

Making headers provided by package packB available to the code to +be compiled in package packA can be done by the LinkingTo +mechanism (see Registering native routines). +

+ +
+ + + +

5.8.2 Windows

+ +

Suppose package packA wants to make use of compiled code provided +by packB in DLL packB/libs/exB.dll, possibly the package’s +DLL packB/libs/packB.dll. (This can be extended to linking to +more than one package in a similar way.) There are three issues to be +addressed: +

+
    +
  • Making headers provided by package packB available to the code to +be compiled in package packA. + +

    This is done by the LinkingTo mechanism (see Registering native routines). +

    +
  • preparing packA.dll to link to packB/libs/exB.dll. + +

    This needs an entry in Makevars.win of the form +

    +
    +
    PKG_LIBS= -L<something> -lexB
    +
    + +

    and one possibility is that <something> is the path to the +installed pkgB/libs directory. To find that we need to ask R +where it is by something like +

    +
    +
    PKGB_PATH=`echo 'library(packB);
    +  cat(system.file("libs", package="packB", mustWork=TRUE))' \
    + | rterm --vanilla --slave`
    +PKG_LIBS= -L"$(PKGB_PATH)$(R_ARCH)" -lexB
    +
    + +

    Another possibility is to use an import library, shipping with package +packA an exports file exB.def. Then Makevars.win +could contain +

    +
    +
    PKG_LIBS= -L. -lexB
    +
    +all: $(SHLIB) before
    +
    +before: libexB.dll.a
    +libexB.dll.a: exB.def
    +
    + +

    and then installing package packA will make and use the import +library for exB.dll. (One way to prepare the exports file is to +use pexports.exe.) +

    +
  • loading packA.dll which depends on exB.dll. + +

    If exB.dll was used by package packB (because it is in fact +packB.dll or packB.dll depends on it) and packB has +been loaded before packA, then nothing more needs to be done as +exB.dll will already be loaded into the R executable. (This +is the most common scenario.) +

    +

    More generally, we can use the DLLpath argument to +library.dynam to ensure that exB.dll is found, for example +by setting +

    +
    +
    library.dynam("packA", pkg, lib,
    +	      DLLpath = system.file("libs", package="packB"))
    +
    + +

    Note that DLLpath can only set one path, and so for linking to +two or more packages you would need to resort to setting environment +variable PATH. +

    +
+ +
+ + + +

5.9 Handling R objects in C

+ + +

Using C code to speed up the execution of an R function is often very +fruitful. Traditionally this has been done via the .C +function in R. However, if a user wants to write C code using +internal R data structures, then that can be done using the +.Call and .External functions. The syntax for the calling +function in R in each case is similar to that of .C, but the +two functions have different C interfaces. Generally the .Call +interface is simpler to use, but .External is a little more +general. + + +

+

A call to .Call is very similar to .C, for example +

+
+
.Call("convolve2", a, b)
+
+ +

The first argument should be a character string giving a C symbol name +of code that has already been loaded into R. Up to 65 R objects +can be passed as arguments. The C side of the interface is +

+
+
#include <R.h>
+#include <Rinternals.h>
+
+SEXP convolve2(SEXP a, SEXP b)
+ ...
+
+ +

A call to .External is almost identical +

+
+
.External("convolveE", a, b)
+
+ +

but the C side of the interface is different, having only one argument +

+
+
#include <R.h>
+#include <Rinternals.h>
+
+SEXP convolveE(SEXP args)
+ ...
+
+ +

Here args is a LISTSXP, a Lisp-style pairlist from which +the arguments can be extracted. +

+

In each case the R objects are available for manipulation via +a set of functions and macros defined in the header file +Rinternals.h or some S-compatibility macros104 defined +in Rdefines.h. See Interface functions .Call and .External +for details on .Call and .External. +

+

Before you decide to use .Call or .External, you should +look at other alternatives. First, consider working in interpreted R +code; if this is fast enough, this is normally the best option. You +should also see if using .C is enough. If the task to be +performed in C is simple enough involving only atomic vectors and +requiring no call to R, .C suffices. A great deal of useful +code was written using just .C before .Call and +.External were available. These interfaces allow much more +control, but they also impose much greater responsibilities so need to +be used with care. Neither .Call nor .External copy their +arguments: you should treat arguments you receive through these +interfaces as read-only. +

+

To handle R objects from within C code we use the macros and functions +that have been used to implement the core parts of R. A +public105 subset of these is defined in the header file +Rinternals.h in the directory R_INCLUDE_DIR (default +R_HOME/include) that should be available on any R +installation. +

+

A substantial amount of R, including the standard packages, is +implemented using the functions and macros described here, so the R +source code provides a rich source of examples and “how to do it”: do +make use of the source code for inspirational examples. +

+

It is necessary to know something about how R objects are handled in +C code. All the R objects you will deal with will be handled with +the type SEXP106, which is a +pointer to a structure with typedef SEXPREC. Think of this +structure as a variant type that can handle all the usual types +of R objects, that is vectors of various modes, functions, +environments, language objects and so on. The details are given later +in this section and in R Internal +Structures in R Internals, but for most +purposes the programmer does not need to know them. Think rather of a +model such as that used by Visual Basic, in which R objects are +handed around in C code (as they are in interpreted R code) as the +variant type, and the appropriate part is extracted for, for example, +numerical calculations, only when it is needed. As in interpreted R +code, much use is made of coercion to force the variant object to the +right type. +

+ + + + + + + + + + + + + +
+ + + +

5.9.1 Handling the effects of garbage collection

+ + + + + +

We need to know a little about the way R handles memory allocation. +The memory allocated for R objects is not freed by the user; instead, +the memory is from time to time garbage collected. That is, some +or all of the allocated memory not being used is freed or marked as +re-usable. +

+

The R object types are represented by a C structure defined by a +typedef SEXPREC in Rinternals.h. It contains several +things among which are pointers to data blocks and to other +SEXPRECs. A SEXP is simply a pointer to a SEXPREC. +

+

If you create an R object in your C code, you must tell R that you +are using the object by using the PROTECT macro on a pointer to +the object. This tells R that the object is in use so it is not +destroyed during garbage collection. Notice that it is the object which +is protected, not the pointer variable. It is a common mistake to +believe that if you invoked PROTECT(p) at some point then +p is protected from then on, but that is not true once a new +object is assigned to p. +

+

Protecting an R object automatically protects all the R objects +pointed to in the corresponding SEXPREC, for example all elements +of a protected list are automatically protected. +

+

The programmer is solely responsible for housekeeping the calls to +PROTECT. There is a corresponding macro UNPROTECT that +takes as argument an int giving the number of objects to +unprotect when they are no longer needed. The protection mechanism is +stack-based, so UNPROTECT(n) unprotects the last n +objects which were protected. The calls to PROTECT and +UNPROTECT must balance when the user’s code returns. R will +warn about "stack imbalance in .Call" (or .External) if +the housekeeping is wrong. +

+

Here is a small example of creating an R numeric vector in C code: +

+
+
#include <R.h>
+#include <Rinternals.h>
+
+    SEXP ab;
+      ....
+    ab = PROTECT(allocVector(REALSXP, 2));
+    REAL(ab)[0] = 123.45;
+    REAL(ab)[1] = 67.89;
+    UNPROTECT(1);
+
+ +

Now, the reader may ask how the R object could possibly get removed +during those manipulations, as it is just our C code that is running. +As it happens, we can do without the protection in this example, but in +general we do not know (nor want to know) what is hiding behind the R +macros and functions we use, and any of them might cause memory to be +allocated, hence garbage collection and hence our object ab to be +removed. It is usually wise to err on the side of caution and assume +that any of the R macros and functions might remove the object. +

+

In some cases it is necessary to keep better track of whether protection +is really needed. Be particularly aware of situations where a large +number of objects are generated. The pointer protection stack has a +fixed size (default 10,000) and can become full. It is not a good idea +then to just PROTECT everything in sight and UNPROTECT +several thousand objects at the end. It will almost invariably be +possible to either assign the objects as part of another object (which +automatically protects them) or unprotect them immediately after use. +

+

Protection is not needed for objects which R already knows are in +use. In particular, this applies to function arguments. +

+

There is a less-used macro UNPROTECT_PTR(s) that unprotects +the object pointed to by the SEXP s, even if it is not the +top item on the pointer protection stack. This is rarely needed outside +the parser (the R sources currently have three examples, one in +src/main/plot3d.c). + +

+

Sometimes an object is changed (for example duplicated, coerced or +grown) yet the current value needs to be protected. For these cases +PROTECT_WITH_INDEX saves an index of the protection location that +can be used to replace the protected value using REPROTECT. + + +For example (from the internal code for optim) +

+
+
    PROTECT_INDEX ipx;
+
+    ....
+    s = PROTECT_WITH_INDEX(eval(OS->R_fcall, OS->R_env), &ipx);
+    s = REPROTECT(coerceVector(s, REALSXP), ipx);
+
+ +

Note that it is dangerous to mix UNPROTECT_PTR with +PROTECT_WITH_INDEX, as the former changes the protection +locations of objects that were protected after the one being +unprotected. +

+ + +

There is another way to avoid the effects of garbage collection: a call +to R_PreserveObject adds an object to an internal list of objects +not to be collected, and a subsequent call to R_ReleaseObject +removes it from that list. This provides a way for objects which are +not returned as part of R objects to be protected across calls to +compiled code: on the other hand it becomes the user’s responsibility to +release them when they are no longer needed (and this often requires the +use of a finalizer). It is less efficient than the normal protection +mechanism, and should be used sparingly. +

+
+ + + +

5.9.2 Allocating storage

+ + +

For many purposes it is sufficient to allocate R objects and +manipulate those. There are quite a few allocXxx functions +defined in Rinternals.h—you may want to explore them. +

+ +

One that is commonly used is allocVector, the C-level equivalent +of R-level vector() and its wrappers such as integer() +and character(). One distinction is that whereas the R +functions always initialize the elements of the vector, +allocVector only does so for lists, expressions and character +vectors (the cases where the elements are themselves R objects). +

+

If storage is required for C objects during the calculations, this is +best allocated by calling R_alloc; see Memory allocation. +All of these memory allocation routines do their own error-checking, so +the programmer may assume that they will raise an error and not return +if the memory cannot be allocated. +
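As a minimal sketch (the entry point reverse_copy is hypothetical), R_alloc provides transient working storage which R reclaims automatically at the end of the call, so no explicit free is needed:

#include <R.h>
#include <Rinternals.h>

SEXP reverse_copy(SEXP x)
{
    if (TYPEOF(x) != REALSXP) error("'x' must be a double vector");
    int n = length(x);
    /* transient C working storage, reclaimed by R after the call */
    double *work = (double *) R_alloc(n, sizeof(double));
    for (int i = 0; i < n; i++) work[i] = REAL(x)[n - 1 - i];

    SEXP ans = PROTECT(allocVector(REALSXP, n));
    for (int i = 0; i < n; i++) REAL(ans)[i] = work[i];
    UNPROTECT(1);
    return ans;
}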

+
+ + + +

5.9.3 Details of R types

+ + +

Users of the Rinternals.h macros will need to know how the R +types are known internally. The different R data types are +represented in C by SEXPTYPE. Some of these are familiar from +R and some are internal data types. The usual R object modes are +given in the table. +

+
+ + + + + + + + + + + + + + +
SEXPTYPE    R equivalent
REALSXP     numeric with storage mode double
INTSXP      integer
CPLXSXP     complex
LGLSXP      logical
STRSXP      character
VECSXP      list (generic vector)
LISTSXP     pairlist
DOTSXP      a ‘...’ object
NILSXP      NULL
SYMSXP      name/symbol
CLOSXP      function or function closure
ENVSXP      environment
+
+ +

Among the important internal SEXPTYPEs are LANGSXP, +CHARSXP, PROMSXP, etc. (N.B.: although it is +possible to return objects of internal types, it is unsafe to do so as +assumptions are made about how they are handled which may be violated at +user-level evaluation.) More details are given in R Internal Structures in R Internals. +

+

Unless you are very sure about the type of the arguments, the code +should check the data types. Sometimes it may also be necessary to +check data types of objects created by evaluating an R expression in +the C code. You can use functions like isReal, isInteger +and isString to do type checking. See the header file +Rinternals.h for definitions of other such functions. All of +these take a SEXP as argument and return 1 or 0 to indicate +TRUE or FALSE. +
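For example (a minimal sketch with a hypothetical entry point), the checks at the top of a .Call routine might look like:

#include <R.h>
#include <Rinternals.h>

SEXP check_args(SEXP x, SEXP label)
{
    if (!isReal(x))
        error("'x' must be a numeric (double) vector");
    if (!isString(label) || length(label) != 1)
        error("'label' must be a single character string");
    /* ... the real work would go here ... */
    return ScalarInteger(length(x));
}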

+

What happens if the SEXP is not of the correct type? Sometimes +you have no other option except to generate an error. You can use the +function error for this. It is usually better to coerce the +object to the correct type. For example, if you find that an +SEXP is of the type INTEGER, but you need a REAL +object, you can change the type by using +

+
+
newSexp = PROTECT(coerceVector(oldSexp, REALSXP));
+
+ +

Protection is needed as a new object is created; the object +formerly pointed to by the SEXP is still protected but now +unused.107 +

+

All the coercion functions do their own error-checking, and generate +NAs with a warning or stop with an error as appropriate. +

+

Note that these coercion functions are not the same as calling +as.numeric (and so on) in R code, as they do not dispatch on +the class of the object. Thus it is normally preferable to do the +coercion in the calling R code. +

+

So far we have only seen how to create and coerce R objects from C +code, and how to extract the numeric data from numeric R vectors. +These can suffice to take us a long way in interfacing R objects to +numerical algorithms, but we may need to know a little more to create +useful return objects. +

+
+ + + +

5.9.4 Attributes

+ + +

Many R objects have attributes: some of the most useful are classes +and the dim and dimnames that mark objects as matrices or +arrays. It can also be helpful to work with the names attribute +of vectors. +

+

To illustrate this, let us write code to take the outer product of two +vectors (which outer and %o% already do). As usual the +R code is simple +

+
+
out <- function(x, y)
+{
+    storage.mode(x) <- storage.mode(y) <- "double"
+    .Call("out", x, y)
+}
+
+ +

where we expect x and y to be numeric vectors (possibly +integer), possibly with names. This time we do the coercion in the +calling R code. +

+

C code to do the computations is +

+
+
#include <R.h>
+#include <Rinternals.h>
+
+SEXP out(SEXP x, SEXP y)
+{
+    int nx = length(x), ny = length(y);
+    SEXP ans = PROTECT(allocMatrix(REALSXP, nx, ny));
+    double *rx = REAL(x), *ry = REAL(y), *rans = REAL(ans);
+    for(int i = 0; i < nx; i++) {
+	double tmp = rx[i];
+	for(int j = 0; j < ny; j++)
+	    rans[i + nx*j] = tmp * ry[j];
+    }
+    UNPROTECT(1);
+    return ans;
+}
+
+ +

Note the way REAL is used: as it is a function call it can be +considerably faster to store the result and index that. +

+

However, we would like to set the dimnames of the result. We can use +

+
+
#include <R.h>
+#include <Rinternals.h>
+
+
SEXP out(SEXP x, SEXP y)
+{
+    int nx = length(x), ny = length(y);
+    SEXP ans = PROTECT(allocMatrix(REALSXP, nx, ny));
+    double *rx = REAL(x), *ry = REAL(y), *rans = REAL(ans);
+
+    for(int i = 0; i < nx; i++) {
+      double tmp = rx[i];
+      for(int j = 0; j < ny; j++)
+	rans[i + nx*j] = tmp * ry[j];
+    }
+
+    SEXP dimnames = PROTECT(allocVector(VECSXP, 2));
+    SET_VECTOR_ELT(dimnames, 0, getAttrib(x, R_NamesSymbol));
+    SET_VECTOR_ELT(dimnames, 1, getAttrib(y, R_NamesSymbol));
+    setAttrib(ans, R_DimNamesSymbol, dimnames);
+
+
    UNPROTECT(2);
+    return ans;
+}
+
+ +

This example introduces several new features. The getAttrib and +setAttrib + + +functions get and set individual attributes. Their second argument is a +SEXP defining the name in the symbol table of the attribute we +want; these and many such symbols are defined in the header file +Rinternals.h. +

+

There are shortcuts here too: the functions namesgets, +dimgets and dimnamesgets are the internal versions of the +default methods of names<-, dim<- and dimnames<- +(for vectors and arrays), and there are functions such as +GetMatrixDimnames and GetArrayDimnames. +

+

What happens if we want to add an attribute that is not pre-defined? We +need to add a symbol for it via a call to + +install. Suppose for illustration we wanted to add an attribute +"version" with value 3.0. We could use +

+
+
    SEXP version;
+    version = PROTECT(allocVector(REALSXP, 1));
+    REAL(version)[0] = 3.0;
+    setAttrib(ans, install("version"), version);
+    UNPROTECT(1);
+
+ +

Using install when it is not needed is harmless and provides a +simple way to retrieve the symbol from the symbol table if it is already +installed. However, the lookup takes a non-trivial amount of time, so +consider code such as +

+
+
static SEXP VerSymbol = NULL;
+...
+    if (VerSymbol == NULL) VerSymbol = install("version");
+
+ +

if it is to be done frequently. +

+

This example can be simplified by another convenience function: +

+
+
    SEXP version = PROTECT(ScalarReal(3.0));
+    setAttrib(ans, install("version"), version);
+    UNPROTECT(1);
+
+ + +
+ + + +

5.9.5 Classes

+ + +

In R the class is just the attribute named "class" so it can +be handled as such, but there is a shortcut classgets. Suppose +we want to give the return value in our example the class "mat". +We can use +

+
+
#include <R.h>
+#include <Rinternals.h>
+      ....
+    SEXP ans, dim, dimnames, class;
+      ....
+    class = PROTECT(allocVector(STRSXP, 1));
+    SET_STRING_ELT(class, 0, mkChar("mat"));
+    classgets(ans, class);
+    UNPROTECT(4);
+    return ans;
+}
+
+ +

As the value is a character vector, we have to know how to create that +from a C character array, which we do using the function +mkChar. +

+
+ + + +

5.9.6 Handling lists

+ + +

Some care is needed with lists, as R moved early on from using +LISP-like lists (now called “pairlists”) to S-like generic vectors. +As a result, the appropriate test for an object of mode list is +isNewList, and we need allocVector(VECSXP, n) and +not allocList(n). +

+

List elements can be retrieved or set by direct access to the elements +of the generic vector. Suppose we have a list object +

+
+
a <- list(f = 1, g = 2, h = 3)
+
+ +

Then we can access a$g as a[[2]] by +

+
+
    double g;
+      ....
+    g = REAL(VECTOR_ELT(a, 1))[0];
+
+ +

This can rapidly become tedious, and the following function (based on +one in package stats) is very useful: +

+
+
/* get the list element named str, or return NULL */
+
+SEXP getListElement(SEXP list, const char *str)
+{
+    SEXP elmt = R_NilValue, names = getAttrib(list, R_NamesSymbol);
+
+
    for (int i = 0; i < length(list); i++)
+	if(strcmp(CHAR(STRING_ELT(names, i)), str) == 0) {
+	   elmt = VECTOR_ELT(list, i);
+	   break;
+	}
+    return elmt;
+}
+
+ +

and enables us to say +

+
+
  double g;
+  g = REAL(getListElement(a, "g"))[0];
+
+ +
+ + + +

5.9.7 Handling character data

+ + +

R character vectors are stored as STRSXPs, a vector type like +VECSXP where every element is of type CHARSXP. The +CHARSXP elements of STRSXPs are accessed using +STRING_ELT and SET_STRING_ELT. +

+

CHARSXPs are read-only objects and must never be modified. In +particular, the C-style string contained in a CHARSXP should be +treated as read-only and for this reason the CHAR function used +to access the character data of a CHARSXP returns (const +char *) (this also allows compilers to issue warnings about improper +use). Since CHARSXPs are immutable, the same CHARSXP can +be shared by any STRSXP needing an element representing the same +string. R maintains a global cache of CHARSXPs so that there +is only ever one CHARSXP representing a given string in memory. +

+ + +

You can obtain a CHARSXP by calling mkChar and providing a +nul-terminated C-style string. This function will return a pre-existing +CHARSXP if one with a matching string already exists, otherwise +it will create a new one and add it to the cache before returning it to +you. The variant mkCharLen can be used to create a +CHARSXP from part of a buffer and will ensure null-termination. +
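As a minimal sketch (the entry point make_greek is hypothetical), a character vector is filled element by element with CHARSXPs obtained from mkChar:

#include <R.h>
#include <Rinternals.h>

SEXP make_greek(void)
{
    /* build the character vector c("alpha", "beta") */
    SEXP ans = PROTECT(allocVector(STRSXP, 2));
    SET_STRING_ELT(ans, 0, mkChar("alpha"));
    SET_STRING_ELT(ans, 1, mkChar("beta"));
    UNPROTECT(1);
    return ans;
}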

+

Note that R character strings are restricted to 2^31 - 1 +bytes, and hence so should the input to mkChar be (C allows +longer strings on 64-bit platforms). +

+
+ + + +

5.9.8 Finding and setting variables

+ + + +

It will be usual that all the R objects needed in our C computations +are passed as arguments to .Call or .External, but it is +possible to find the values of R objects from within the C code given +their names. The following code is the equivalent of get(name, +envir = rho). +

+
+
SEXP getvar(SEXP name, SEXP rho)
+{
+    SEXP ans;
+
+    if(!isString(name) || length(name) != 1)
+	error("name is not a single string");
+    if(!isEnvironment(rho))
+	error("rho should be an environment");
+    ans = findVar(installChar(STRING_ELT(name, 0)), rho);
+    Rprintf("first value is %f\n", REAL(ans)[0]);
+    return R_NilValue;
+}
+
+ +

The main work is done by + +findVar, but to use it we need to install name as a name +in the symbol table. As we wanted the value for internal use, we return +NULL. +

+

Similar functions with syntax +

+
+
void defineVar(SEXP symbol, SEXP value, SEXP rho)
+void setVar(SEXP symbol, SEXP value, SEXP rho)
+
+ + + +

can be used to assign values to R variables. defineVar +creates a new binding or changes the value of an existing binding in the +specified environment frame; it is the analogue of assign(symbol, +value, envir = rho, inherits = FALSE), but unlike assign, +defineVar does not make a copy of the object +value.108 setVar searches for an existing +binding for symbol in rho or its enclosing environments. +If a binding is found, its value is changed to value. Otherwise, +a new binding with the specified value is created in the global +environment. This corresponds to assign(symbol, value, envir = +rho, inherits = TRUE). +
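A minimal sketch (with a hypothetical entry point and variable name) of the C-level analogue of assign("answer", 42, envir = rho, inherits = FALSE) is:

#include <R.h>
#include <Rinternals.h>

SEXP set_answer(SEXP rho)
{
    if (!isEnvironment(rho)) error("'rho' should be an environment");
    SEXP val = PROTECT(ScalarReal(42.0));
    /* create or update the binding of 'answer' in rho (no copy of val) */
    defineVar(install("answer"), val, rho);
    UNPROTECT(1);
    return R_NilValue;
}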

+
+ + + +

5.9.9 Some convenience functions

+ +

Some operations are done so frequently that there are convenience +functions to handle them. (All these are provided via the header file +Rinternals.h.) +

+

Suppose we wanted to pass a single logical argument +ignore_quotes: we could use +

+
+
    int ign = asLogical(ignore_quotes);
+    if(ign == NA_LOGICAL) error("'ignore_quotes' must be TRUE or FALSE");
+
+ +

which will do any coercion needed (at least from a vector argument), and +return NA_LOGICAL if the value passed was NA or coercion +failed. There are also asInteger, asReal and +asComplex. The function asChar returns a CHARSXP. +All of these functions ignore any elements of an input vector after the +first. +

+

To return a length-one real vector we can use +

+
+
    double x;
+
+    ...
+    return ScalarReal(x);
+
+ +

and there are versions of this for all the atomic vector types (those for +a length-one character vector being ScalarString with argument a +CHARSXP and mkString with argument const char *). +

+

Some of the isXXXX functions differ from their apparent +R-level counterparts: for example isVector is true for any +atomic vector type (isVectorAtomic) and for lists and expressions +(isVectorList) (with no check on attributes). isMatrix is +a test of a length-2 "dim" attribute. +
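A minimal sketch (hypothetical entry point) of how these tests might be combined:

#include <R.h>
#include <Rinternals.h>

SEXP describe(SEXP x)
{
    if (isMatrix(x))       return mkString("matrix");
    if (isVectorAtomic(x)) return mkString("atomic vector");
    if (isVectorList(x))   return mkString("list or expression");
    return mkString("something else");
}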

+

There are a series of small macros/functions to help construct pairlists +and language objects (whose internal structures just differ by +SEXPTYPE). Function CONS(u, v) is the basic building +block: it constructs a pairlist from u followed by v +(which is a pairlist or R_NilValue). LCONS is a variant +that constructs a language object. Functions list1 to +list5 construct a pairlist from one to five items, and +lang1 to lang6 do the same for a language object (a +function to call plus zero to five arguments). Functions elt and +lastElt find the ith element and the last element of a +pairlist, and nthcdr returns a pointer to the nth position +in the pairlist (whose CAR is the nth item). +
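As a minimal sketch (the entry point make_call is hypothetical), lang3 and SET_TAG can be used to build the unevaluated call sum(x, na.rm = TRUE):

#include <R.h>
#include <Rinternals.h>

SEXP make_call(SEXP x)
{
    SEXP narm = PROTECT(ScalarLogical(TRUE));
    SEXP call = PROTECT(lang3(install("sum"), x, narm));
    SET_TAG(CDDR(call), install("na.rm"));  /* name the second argument */
    UNPROTECT(2);
    return call;
}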

+

Functions str2type and type2str map R +length-one character strings to and from SEXPTYPE numbers, and +type2char maps numbers to C character strings. +

+ + + + +
+ + + +

5.9.9.1 Semi-internal convenience functions

+ +

There is quite a collection of functions that may be used in your C code +if you are willing to adapt to rare “API” changes. +These typically contain “workhorses” of their R counterparts. +

+

Functions any_duplicated and any_duplicated3 are fast +versions of R’s any(duplicated(.)). +

+

Function R_compute_identical corresponds to R’s identical function. +

+ +
+ + + +

5.9.10 Named objects and copying

+ + + +

When assignments are done in R such as +

+
+
x <- 1:10
+y <- x
+
+ +

the named object is not necessarily copied, so after those two +assignments y and x are bound to the same SEXPREC +(the structure a SEXP points to). This means that any code which +alters one of them has to make a copy before modifying the copy if the +usual R semantics are to apply. Note that whereas .C and +.Fortran do copy their arguments (unless the dangerous dup = FALSE is used), .Call and .External do not. So +duplicate is commonly called on arguments to .Call before +modifying them. +
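For example (a minimal sketch with a hypothetical entry point), a routine that wants to modify its .Call argument should work on a duplicate:

#include <R.h>
#include <Rinternals.h>

SEXP double_first(SEXP x)
{
    if (TYPEOF(x) != REALSXP) error("'x' must be a double vector");
    SEXP y = PROTECT(duplicate(x));     /* do not modify the caller's x */
    if (length(y) > 0) REAL(y)[0] *= 2;
    UNPROTECT(1);
    return y;
}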

+

However, at least some of this copying is unneeded. In the first +assignment shown, x <- 1:10, R first creates an object with +value 1:10 and then assigns it to x but if x is +modified no copy is necessary as the temporary object with value +1:10 cannot be referred to again. R distinguishes between +named and unnamed objects via a field in a SEXPREC that +can be accessed via the macros NAMED and SET_NAMED. This +can take values +

+
+
0   The object is not bound to any symbol
1   The object has been bound to exactly one symbol
2   The object has potentially been bound to two or more symbols, and one
    should act as if another variable is currently bound to this value.

+
+ +

Note the past tenses: R does not do full reference counting and there +may currently be fewer bindings. +

+

It is safe to modify the value of any SEXP for which +NAMED(foo) is zero, and if NAMED(foo) is two, the value +should be duplicated (via a call to duplicate) before any +modification. Note that it is the responsibility of the author of the +code making the modification to do the duplication, even if it is +x whose value is being modified after y <- x. +

+

The case NAMED(foo) == 1 allows some optimization, but it can be +ignored (and duplication done whenever NAMED(foo) > 0). (This +optimization is not currently usable in user code.) It is intended +for use within replacement functions. Suppose we used +

+
+
x <- 1:10
+foo(x) <- 3
+
+ +

which is computed as +

+
+
x <- 1:10
+x <- "foo<-"(x, 3)
+
+ +

Then inside "foo<-" the object pointing to the current value of +x will have NAMED(foo) as one, and it would be safe to +modify it as the only symbol bound to it is x and that will be +rebound immediately. (Provided the remaining code in "foo<-" +make no reference to x, and no one is going to attempt a direct +call such as y <- "foo<-"(x).) +

+

This mechanism is likely to be replaced in future versions of R. +

+ +
+ + + +

5.10 Interface functions .Call and .External

+ + +

In this section we consider the details of the R/C interfaces. +

+

These two interfaces have almost the same functionality. .Call is +based on the interface of the same name in S version 4, and +.External is based on R’s .Internal. .External +is more complex but allows a variable number of arguments. +

+ + + + + + +
+ + + +

5.10.1 Calling .Call

+ + + +

Let us convert our finite convolution example to use .Call. The +calling function in R is +

+
+
conv <- function(a, b) .Call("convolve2", a, b)
+
+ +

which could hardly be simpler, but as we shall see all the type +coercion is transferred to the C code, which is +

+
+
#include <R.h>
+#include <Rinternals.h>
+
+SEXP convolve2(SEXP a, SEXP b)
+{
+    int na, nb, nab;
+    double *xa, *xb, *xab;
+    SEXP ab;
+
+    a = PROTECT(coerceVector(a, REALSXP));
+    b = PROTECT(coerceVector(b, REALSXP));
+    na = length(a); nb = length(b); nab = na + nb - 1;
+    ab = PROTECT(allocVector(REALSXP, nab));
+    xa = REAL(a); xb = REAL(b); xab = REAL(ab);
+    for(int i = 0; i < nab; i++) xab[i] = 0.0;
+    for(int i = 0; i < na; i++)
+	for(int j = 0; j < nb; j++) xab[i + j] += xa[i] * xb[j];
+    UNPROTECT(3);
+    return ab;
+}
+
+ +
+ + + +

5.10.2 Calling .External

+ + + +

We can use the same example to illustrate .External. The R +code changes only by replacing .Call by .External +

+
+
conv <- function(a, b) .External("convolveE", a, b)
+
+ +

but the main change is how the arguments are passed to the C code, this +time as a single SEXP. The only change to the C code is how we handle +the arguments. +

+
+
#include <R.h>
+#include <Rinternals.h>
+
+SEXP convolveE(SEXP args)
+{
+    int i, j, na, nb, nab;
+    double *xa, *xb, *xab;
+    SEXP a, b, ab;
+
+    a = PROTECT(coerceVector(CADR(args), REALSXP));
+    b = PROTECT(coerceVector(CADDR(args), REALSXP));
+    ...
+}
+
+ +

Once again we do not need to protect the arguments, as in the R side +of the interface they are objects that are already in use. The macros +

+
+
  first = CADR(args);
+  second = CADDR(args);
+  third = CADDDR(args);
+  fourth = CAD4R(args);
+
+ +

provide convenient ways to access the first four arguments. More +generally we can use the + + +CDR and CAR macros as in +

+
+
  args = CDR(args); a = CAR(args);
+  args = CDR(args); b = CAR(args);
+
+ +

which clearly allows us to extract an unlimited number of arguments +(whereas .Call has a limit, albeit at 65 not a small one). +

+

More usefully, the .External interface provides an easy way to +handle calls with a variable number of arguments, as length(args) +will give the number of arguments supplied (of which the first is +ignored). We may need to know the names (‘tags’) given to the actual +arguments, which we can do by using the TAG macro and using +something like the following example, which prints the names and the first +value of its arguments if they are vector types. +

+
+
SEXP showArgs(SEXP args)
+{
+    args = CDR(args); /* skip ‘name’ */
+    for(int i = 0; args != R_NilValue; i++, args = CDR(args)) {
+	const char *name =
+	    isNull(TAG(args)) ? "" : CHAR(PRINTNAME(TAG(args)));
+	SEXP el = CAR(args);
+	if (length(el) == 0) {
+	    Rprintf("[%d] ‘%s’ R type, length 0\n", i+1, name);
+	   continue;
+	}
+
	switch(TYPEOF(el)) {
+	case REALSXP:
+	    Rprintf("[%d] ‘%s’ %f\n", i+1, name, REAL(el)[0]);
+	    break;
+
	case LGLSXP:
+	case INTSXP:
+	    Rprintf("[%d] ‘%s’ %d\n", i+1, name, INTEGER(el)[0]);
+	    break;
+
	case CPLXSXP:
+	{
+	    Rcomplex cpl = COMPLEX(el)[0];
+	    Rprintf("[%d] ‘%s’ %f + %fi\n", i+1, name, cpl.r, cpl.i);
+	}
+	    break;
+
	case STRSXP:
+	    Rprintf("[%d] ‘%s’ %s\n", i+1, name,
+		   CHAR(STRING_ELT(el, 0)));
+	   break;
+
	default:
+	    Rprintf("[%d] ‘%s’ R type\n", i+1, name);
+       }
+    }
+    return R_NilValue;
+}
+
+ +

This can be called by the wrapper function +

+
+
showArgs <- function(...) invisible(.External("showArgs", ...))
+
+ +

Note that this style of programming is convenient but not necessary, as +an alternative style is +

+
+
showArgs1 <- function(...) invisible(.Call("showArgs1", list(...)))
+
+ +

The (very similar) C code is in the scripts. +

+
+ + + +

5.10.3 Missing and special values

+ + + +

One piece of error-checking the .C call does (unless NAOK +is true) is to check for missing (NA) and IEEE special +values (Inf, -Inf and NaN) and give an error if any +are found. With the .Call interface these will be passed to our +code. In this example the special values are no problem, as +IEC60559 arithmetic will handle them correctly. In the current +implementation this is also true of NA as it is a type of +NaN, but it is unwise to rely on such details. Thus we will +re-write the code to handle NAs using macros defined in +R_ext/Arith.h included by R.h. +

+

The code changes are the same in any of the versions of convolve2 +or convolveE: +

+
+
    ...
+  for(int i = 0; i < na; i++)
+    for(int j = 0; j < nb; j++)
+	if(ISNA(xa[i]) || ISNA(xb[j]) || ISNA(xab[i + j]))
+	    xab[i + j] = NA_REAL;
+	else
+	    xab[i + j] += xa[i] * xb[j];
+    ...
+
+ + + + +

Note that the ISNA macro, and the similar macros ISNAN +(which checks for NaN or NA) and R_FINITE (which is +false for NA and all the special values), only apply to numeric +values of type double. Missingness of integers, logicals and +character strings can be tested by equality to the constants +NA_INTEGER, NA_LOGICAL and NA_STRING. These and +NA_REAL can be used to set elements of R vectors to NA. +
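For example (a minimal sketch with a hypothetical entry point), missing values can be set in an integer vector via NA_INTEGER:

#include <R.h>
#include <Rinternals.h>

SEXP na_if_negative(SEXP x)
{
    if (TYPEOF(x) != INTSXP) error("'x' must be an integer vector");
    SEXP y = PROTECT(duplicate(x));
    int *iy = INTEGER(y);
    for (int i = 0; i < length(y); i++)
        if (iy[i] != NA_INTEGER && iy[i] < 0) iy[i] = NA_INTEGER;
    UNPROTECT(1);
    return y;
}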

+

The constants R_NaN, R_PosInf and R_NegInf can be +used to set doubles to the special values. +

+
+ + + +

5.11 Evaluating R expressions from C

+ + +

The main function we will use is +

+
+
SEXP eval(SEXP expr, SEXP rho);
+
+ +

the equivalent of the interpreted R code eval(expr, envir = +rho) (so rho must be an environment), although we can also make +use of findVar, defineVar and findFun (which +restricts the search to functions). +

+

To see how this might be applied, here is a simplified internal version +of lapply for expressions, used as +

+
+
a <- list(a = 1:5, b = rnorm(10), test = runif(100))
+.Call("lapply", a, quote(sum(x)), new.env())
+
+ +

with C code +

+
+
SEXP lapply(SEXP list, SEXP expr, SEXP rho)
+{
+    int n = length(list);
+    SEXP ans;
+
+    if(!isNewList(list)) error("'list' must be a list");
+    if(!isEnvironment(rho)) error("'rho' should be an environment");
+    ans = PROTECT(allocVector(VECSXP, n));
+    for(int i = 0; i < n; i++) {
+	defineVar(install("x"), VECTOR_ELT(list, i), rho);
+	SET_VECTOR_ELT(ans, i, eval(expr, rho));
+    }
+    setAttrib(ans, R_NamesSymbol, getAttrib(list, R_NamesSymbol));
+    UNPROTECT(1);
+    return ans;
+}
+
+ +

It would be closer to lapply if we could pass in a function +rather than an expression. One way to do this is via interpreted +R code as in the next example, but it is possible (if somewhat +obscure) to do this in C code. The following is based on the code in +src/main/optimize.c. +

+
+
SEXP lapply2(SEXP list, SEXP fn, SEXP rho)
+{
+    int n = length(list);
+    SEXP R_fcall, ans;
+
+    if(!isNewList(list)) error("'list' must be a list");
+    if(!isFunction(fn)) error("'fn' must be a function");
+    if(!isEnvironment(rho)) error("'rho' should be an environment");
+    R_fcall = PROTECT(lang2(fn, R_NilValue));
+    ans = PROTECT(allocVector(VECSXP, n));
+    for(int i = 0; i < n; i++) {
+	SETCADR(R_fcall, VECTOR_ELT(list, i));
+	SET_VECTOR_ELT(ans, i, eval(R_fcall, rho));
+    }
+    setAttrib(ans, R_NamesSymbol, getAttrib(list, R_NamesSymbol));
+    UNPROTECT(2);
+    return ans;
+}
+
+ +

used by +

+
+
.Call("lapply2", a, sum, new.env())
+
+ +

Function lang2 creates an executable pairlist of two elements, but +this will only be clear to those with a knowledge of a LISP-like +language. +

+

As a more comprehensive example of constructing an R call in C code +and evaluating, consider the following fragment of +printAttributes in src/main/print.c. +

+
+
    /* Need to construct a call to
+       print(CAR(a), digits=digits)
+       based on the R_print structure, then eval(call, env).
+       See do_docall for the template for this sort of thing.
+    */
+    SEXP s, t;
+    t = s = PROTECT(allocList(3));
+    SET_TYPEOF(s, LANGSXP);
+    SETCAR(t, install("print")); t = CDR(t);
+    SETCAR(t,  CAR(a)); t = CDR(t);
+    SETCAR(t, ScalarInteger(digits));
+    SET_TAG(t, install("digits"));
+    eval(s, env);
+    UNPROTECT(1);
+
+ +

At this point CAR(a) is the R object to be printed, the +current attribute. There are three steps: the call is constructed as +a pairlist of length 3, the list is filled in, and the expression +represented by the pairlist is evaluated. +

+

A pairlist is quite distinct from a generic vector list, the only +user-visible form of list in R. A pairlist is a linked list (with +CDR(t) computing the next entry), with items (accessed by +CAR(t)) and names or tags (set by SET_TAG). In this call +there are to be three items, a symbol (pointing to the function to be +called) and two argument values, the first unnamed and the second named. +Setting the type to LANGSXP makes this a call which can be evaluated. +

+ + + + + +
+ + + +

5.11.1 Zero-finding

+ + +

In this section we re-work the example of Becker, Chambers & Wilks (1988, +pp. 205–10) on finding a zero of a univariate function. The R code +and an example are +

+
+
zero <- function(f, guesses, tol = 1e-7) {
+    f.check <- function(x) {
+	x <- f(x)
+	if(!is.numeric(x)) stop("Need a numeric result")
+	as.double(x)
+    }
+    .Call("zero", body(f.check), as.double(guesses), as.double(tol),
+	  new.env())
+}
+
+cube1 <- function(x) (x^2 + 1) * (x - 1.5)
+zero(cube1, c(0, 5))
+
+ +

where this time we do the coercion and error-checking in the R code. +The C code is +

+
+
SEXP mkans(double x)
+{
+    // no need for PROTECT() here, as REAL(.) does not allocate:
+    SEXP ans = allocVector(REALSXP, 1);
+    REAL(ans)[0] = x;
+    return ans;
+}
+
+
double feval(double x, SEXP f, SEXP rho)
+{
+    // a version with (too) much PROTECT()ion .. "better safe than sorry"
+    SEXP symbol, value;
+    PROTECT(symbol = install("x"));
+    PROTECT(value = mkans(x));
+    defineVar(symbol, value, rho);
+    UNPROTECT(2);
+    return(REAL(eval(f, rho))[0]);
+}
+
+
SEXP zero(SEXP f, SEXP guesses, SEXP stol, SEXP rho)
+{
+    double x0 = REAL(guesses)[0], x1 = REAL(guesses)[1],
+	   tol = REAL(stol)[0];
+    double f0, f1, fc, xc;
+
+
    if(tol <= 0.0) error("non-positive tol value");
+    f0 = feval(x0, f, rho); f1 = feval(x1, f, rho);
+    if(f0 == 0.0) return mkans(x0);
+    if(f1 == 0.0) return mkans(x1);
+    if(f0*f1 > 0.0) error("x[0] and x[1] have the same sign");
+
+
    for(;;) {
+	xc = 0.5*(x0+x1);
+	if(fabs(x0-x1) < tol) return  mkans(xc);
+	fc = feval(xc, f, rho);
+	if(fc == 0) return  mkans(xc);
+	if(f0*fc > 0.0) {
+	    x0 = xc; f0 = fc;
+	} else {
+	    x1 = xc; f1 = fc;
+	}
+    }
+}
+
+ +
+ + + +

5.11.2 Calculating numerical derivatives

+ + +

We will use a longer example (by Saikat DebRoy) to illustrate the use of +evaluation and .External. This calculates numerical derivatives, +something that could be done as effectively in interpreted R code but +may be needed as part of a larger C calculation. +

+

An interpreted R version and an example are +

+
+
numeric.deriv <- function(expr, theta, rho=sys.frame(sys.parent()))
+{
+    eps <- sqrt(.Machine$double.eps)
+    ans <- eval(substitute(expr), rho)
+    grad <- matrix(, length(ans), length(theta),
+		   dimnames=list(NULL, theta))
+    for (i in seq_along(theta)) {
+	old <- get(theta[i], envir=rho)
+	delta <- eps * max(1, abs(old))
+	assign(theta[i], old+delta, envir=rho)
+	ans1 <- eval(substitute(expr), rho)
+	assign(theta[i], old, envir=rho)
+	grad[, i] <- (ans1 - ans)/delta
+    }
+    attr(ans, "gradient") <- grad
+    ans
+}
+omega <- 1:5; x <- 1; y <- 2
+numeric.deriv(sin(omega*x*y), c("x", "y"))
+
+ +

where expr is an expression, theta a character vector of +variable names and rho the environment to be used. +

+

For the compiled version the call from R will be +

+
+
.External("numeric_deriv", expr, theta, rho)
+
+ +

with example usage +

+
+
.External("numeric_deriv", quote(sin(omega*x*y)),
+	  c("x", "y"), .GlobalEnv)
+
+ +

Note the need to quote the expression to stop it being evaluated in the +caller. +

+

Here is the complete C code which we will explain section by section. +

+
+
#include <R.h> /* for DOUBLE_EPS */
+#include <Rinternals.h>
+
+SEXP numeric_deriv(SEXP args)
+{
+    SEXP theta, expr, rho, ans, ans1, gradient, par, dimnames;
+    double tt, xx, delta, eps = sqrt(DOUBLE_EPS), *rgr, *rans;
+    int i, start;
+
+
    expr = CADR(args);
+    if(!isString(theta = CADDR(args)))
+	error("theta should be of type character");
+    if(!isEnvironment(rho = CADDDR(args)))
+	error("rho should be an environment");
+
+
    ans = PROTECT(coerceVector(eval(expr, rho), REALSXP));
+    gradient = PROTECT(allocMatrix(REALSXP, LENGTH(ans), LENGTH(theta)));
+    rgr = REAL(gradient); rans = REAL(ans);
+
+
    for(i = 0, start = 0; i < LENGTH(theta); i++, start += LENGTH(ans)) {
+	par = PROTECT(findVar(installChar(STRING_ELT(theta, i)), rho));
+	tt = REAL(par)[0];
+	xx = fabs(tt);
+	delta = (xx < 1) ? eps : xx*eps;
+	REAL(par)[0] += delta;
+	ans1 = PROTECT(coerceVector(eval(expr, rho), REALSXP));
+	for(int j = 0; j < LENGTH(ans); j++)
+	    rgr[j + start] = (REAL(ans1)[j] - rans[j])/delta;
+	REAL(par)[0] = tt;
+	UNPROTECT(2); /* par, ans1 */
+    }
+
+
    dimnames = PROTECT(allocVector(VECSXP, 2));
+    SET_VECTOR_ELT(dimnames, 1,  theta);
+    dimnamesgets(gradient, dimnames);
+    setAttrib(ans, install("gradient"), gradient);
+    UNPROTECT(3); /* ans  gradient  dimnames */
+    return ans;
+}
+
+ +

The code to handle the arguments is +

+
+
    expr = CADR(args);
+    if(!isString(theta = CADDR(args)))
+	error("theta should be of type character");
+    if(!isEnvironment(rho = CADDDR(args)))
+	error("rho should be an environment");
+
+ +

Note that we check for correct types of theta and rho but +do not check the type of expr. That is because eval can +handle many types of R objects other than EXPRSXP. There is +no useful coercion we can do, so we stop with an error message if the +arguments are not of the correct mode. +

+

The first step in the code is to evaluate the expression in the +environment rho, by +

+
+
    ans = PROTECT(coerceVector(eval(expr, rho), REALSXP));
+
+ +

We then allocate space for the calculated derivative by +

+
+
    gradient = PROTECT(allocMatrix(REALSXP, LENGTH(ans), LENGTH(theta)));
+
+ +

The first argument to allocMatrix gives the SEXPTYPE of +the matrix: here we want it to be REALSXP. The other two +arguments are the numbers of rows and columns. (Note that LENGTH +is intended to be used for vectors: length is more generally +applicable.) +

+
+
    for(i = 0, start = 0; i < LENGTH(theta); i++, start += LENGTH(ans)) {
+	par = PROTECT(findVar(installChar(STRING_ELT(theta, i)), rho));
+
+ +

Here, we are entering a for loop. We loop through each of the +variables. In the for loop, we first create a symbol +corresponding to the i’th element of the STRSXP +theta. Here, STRING_ELT(theta, i) accesses the +i’th element of the STRSXP theta. Macro +CHAR() extracts the actual character +representation109 of it: it returns a pointer. We then +install the name and use findVar to find its value. +

+
+
	tt = REAL(par)[0];
+	xx = fabs(tt);
+	delta = (xx < 1) ? eps : xx*eps;
+	REAL(par)[0] += delta;
+	ans1 = PROTECT(coerceVector(eval(expr, rho), REALSXP));
+
+ +

We first extract the real value of the parameter, then calculate +delta, the increment to be used for approximating the numerical +derivative. Then we change the value stored in par (in +environment rho) by delta and evaluate expr in +environment rho again. Because we are directly dealing with +original R memory locations here, R does the evaluation for the +changed parameter value. +

+
+
	for(int j = 0; j < LENGTH(ans); j++)
+	    rgr[j + start] = (REAL(ans1)[j] - rans[j])/delta;
+	REAL(par)[0] = tt;
+	UNPROTECT(2);
+    }
+
+ +

Now, we compute the i’th column of the gradient matrix. Note how +it is accessed: R stores matrices by column (like FORTRAN). +

+
+
    dimnames = PROTECT(allocVector(VECSXP, 2));
+    SET_VECTOR_ELT(dimnames, 1, theta);
+    dimnamesgets(gradient, dimnames);
+    setAttrib(ans, install("gradient"), gradient);
+    UNPROTECT(3);
+    return ans;
+}
+
+ +

First we add column names to the gradient matrix. This is done by +allocating a list (a VECSXP) whose first element, the row names, +is NULL (the default) and the second element, the column names, +is set as theta. This list is then assigned as the attribute +having the symbol R_DimNamesSymbol. Finally we set the gradient +matrix as the gradient attribute of ans, unprotect the remaining +protected locations and return the answer ans. +

+
+ + + +

5.12 Parsing R code from C

+ + +

Suppose an R extension wants to accept an R expression from the +user and evaluate it. The previous section covered evaluation, but the +expression will be entered as text and needs to be parsed first. A +small part of R’s parse interface is declared in header file +R_ext/Parse.h110. +

+

An example of the usage can be found in the (example) Windows package +windlgs included in the R source tree. The essential part is +

+
+
#include <R.h>
+#include <Rinternals.h>
+#include <R_ext/Parse.h>
+
+SEXP menu_ttest3()
+{
+    char cmd[256];
+    SEXP cmdSexp, cmdexpr, ans = R_NilValue;
+    ParseStatus status;
+   ...
+    if(done == 1) {
+	cmdSexp = PROTECT(allocVector(STRSXP, 1));
+	SET_STRING_ELT(cmdSexp, 0, mkChar(cmd));
+	cmdexpr = PROTECT(R_ParseVector(cmdSexp, -1, &status, R_NilValue));
+	if (status != PARSE_OK) {
+	    UNPROTECT(2);
+	    error("invalid call %s", cmd);
+	}
	/* Loop is needed here as EXPRSXP will be of length > 1 */
+	for(int i = 0; i < length(cmdexpr); i++)
+	    ans = eval(VECTOR_ELT(cmdexpr, i), R_GlobalEnv);
+	UNPROTECT(2);
+    }
+    return ans;
+}
+
+

Note that a single line of text may give rise to more than one R +expression. +

+ +

R_ParseVector is essentially the code used to implement +parse(text=) at R level. The first argument is a character +vector (corresponding to text) and the second the maximal +number of expressions to parse (corresponding to n). The third +argument is a pointer to a variable of an enumeration type, and it is +normal (as parse does) to regard all values other than +PARSE_OK as an error. Other values which might be returned are +PARSE_INCOMPLETE (an incomplete expression was found) and +PARSE_ERROR (a syntax error), in both cases the value returned +being R_NilValue. The fourth argument is a length one character +vector to be used as a filename in error messages, a srcfile +object or the R NULL object (as in the example above). If a +srcfile object was used, a srcref attribute would be +attached to the result, containing a list of srcref objects of +the same length as the expression, to allow it to be echoed with its +original formatting. +

+ + + + +
+ + + +

5.12.1 Accessing source references

+ +

The source references added by the parser are recorded by R’s evaluator +as it evaluates code. Two functions +make these available to debuggers running C code: + + + +

+
+
SEXP R_GetCurrentSrcref(int skip);
+
+ +

This function checks R_Srcref and the current evaluation stack +for entries that contain source reference information. The +skip argument tells how many source references to skip before +returning the SEXP of the srcref object, counting from +the top of the stack. If skip < 0, abs(skip) locations +are counted up from the bottom of the stack. If too few or no source +references are found, NULL is returned. +

+
+
SEXP R_GetSrcFilename(SEXP srcref);
+
+ +

This function extracts the filename from the source reference for +display, returning a length 1 character vector containing the +filename. If no name is found, "" is returned. +
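A minimal sketch (hypothetical entry point; it assumes these declarations are available through Rinternals.h) combining the two calls to report where evaluation currently is:

#include <R.h>
#include <Rinternals.h>

SEXP current_source_file(void)
{
    SEXP ref = R_GetCurrentSrcref(0);       /* innermost srcref, if any */
    if (ref == NULL || ref == R_NilValue)
        return mkString("");                 /* no source reference found */
    return R_GetSrcFilename(ref);
}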

+
+ + + +

5.13 External pointers and weak references

+ +

The SEXPTYPEs EXTPTRSXP and WEAKREFSXP can be +encountered at R level, but are created in C code. +

+ +

External pointer SEXPs are intended to handle references to C +structures such as ‘handles’, and are used for this purpose in package +RODBC for example. They are unusual in their copying semantics in +that when an R object is copied, the external pointer object is not +duplicated. (For this reason external pointers should only be used as +part of an object with normal semantics, for example an attribute or an +element of a list.) +

+

An external pointer is created by +

+
+
SEXP R_MakeExternalPtr(void *p, SEXP tag, SEXP prot);
+
+ +

where p is the pointer (and hence this cannot portably be a +function pointer), and tag and prot are references to +ordinary R objects which will remain in existence (be protected from +garbage collection) for the lifetime of the external pointer object. A +useful convention is to use the tag field for some form of type +identification and the prot field for protecting the memory that +the external pointer represents, if that memory is allocated from the +R heap. Both tag and prot can be R_NilValue, +and often are. +

+

The elements of an external pointer can be accessed and set via +

+
+
void *R_ExternalPtrAddr(SEXP s);
+SEXP R_ExternalPtrTag(SEXP s);
+SEXP R_ExternalPtrProtected(SEXP s);
+void R_ClearExternalPtr(SEXP s);
+void R_SetExternalPtrAddr(SEXP s, void *p);
+void R_SetExternalPtrTag(SEXP s, SEXP tag);
+void R_SetExternalPtrProtected(SEXP s, SEXP p);
+
+ +

Clearing a pointer sets its value to the C NULL pointer. +

+ +

An external pointer object can have a finalizer, a piece of code +to be run when the object is garbage collected. This can be R code +or C code, and the various interfaces are, respectively. +

+
+
void R_RegisterFinalizerEx(SEXP s, SEXP fun, Rboolean onexit);
+
+typedef void (*R_CFinalizer_t)(SEXP);
+void R_RegisterCFinalizerEx(SEXP s, R_CFinalizer_t fun, Rboolean onexit);
+
+ +

The R function indicated by fun should be a function of a +single argument, the object to be finalized. R does not perform a +garbage collection when shutting down, and the onexit argument of +the extended forms can be used to ask that the finalizer be run during a +normal shutdown of the R session. It is suggested that it is good +practice to clear the pointer on finalization. +

+

The only R level function for interacting with external pointers is +reg.finalizer which can be used to set a finalizer. +

+

It is probably not a good idea to allow an external pointer to be +saved and then reloaded, but if this happens the pointer will be +set to the C NULL pointer. +

+

Finalizers can be run at many places in the code base and much of it, +including the R interpreter, is not re-entrant. So great care is +needed in choosing the code to be run in a finalizer. As from R 3.0.3 +finalizers are marked to be run at garbage collection but only run at a +somewhat safe point thereafter. +

+ +

Weak references are used to allow the programmer to maintain information +on entities without preventing the garbage collection of the entities +once they become unreachable. +

+

A weak reference contains a key and a value. The value is reachable if it is either reachable directly or via weak references with reachable keys. Once a value is determined to be unreachable during garbage collection, the key and value are set to R_NilValue and the finalizer will be run later in the garbage collection.

+

Weak reference objects are created by one of +

+
+
SEXP R_MakeWeakRef(SEXP key, SEXP val, SEXP fin, Rboolean onexit);
+SEXP R_MakeWeakRefC(SEXP key, SEXP val, R_CFinalizer_t fin,
+		    Rboolean onexit);
+
+ +

where the R or C finalizer are specified in exactly the same way as +for an external pointer object (whose finalization interface is +implemented via weak references). +

+

The parts can be accessed via +

+
+
SEXP R_WeakRefKey(SEXP w);
+SEXP R_WeakRefValue(SEXP w);
+void R_RunWeakRefFinalizer(SEXP w);
+
+ +

A toy example of the use of weak references can be found at +homepage.stat.uiowa.edu/~luke/R/references/weakfinex.html, +but that is used to add finalizers to external pointers which can now be +done more directly. At the time of writing no CRAN or +Bioconductor package uses weak references. +
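For illustration only, a minimal sketch of creating a weak reference with a C finalizer is shown below; the names make_registry_entry and entry_finalizer are invented for the example.

#include <R.h>
#include <Rinternals.h>

/* Run once the key has become unreachable: release any C-side state
   associated with it here.  The weak-reference mechanism passes the key. */
static void entry_finalizer(SEXP key)
{
    Rprintf("key collected, releasing associated resources\n");
}

/* Associate 'value' with 'key' without keeping 'key' alive:
   'value' stays reachable only for as long as 'key' is. */
SEXP make_registry_entry(SEXP key, SEXP value)
{
    return R_MakeWeakRefC(key, value, entry_finalizer, FALSE);
}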

+ + + + + +
+ + + +

5.13.1 An example

+ +

Package RODBC uses external pointers to maintain its channels, connections to databases. There can be several connections open at once, and the status information for each is stored in a C structure (pointed to by thisHandle in the code extract below) that is returned via an external pointer as part of the RODBC ‘channel’ (as the "handle_ptr" attribute). The external pointer is created by

+
+
    SEXP ans, ptr;
+    ans = PROTECT(allocVector(INTSXP, 1));
+    ptr = R_MakeExternalPtr(thisHandle, install("RODBC_channel"), R_NilValue);
+    PROTECT(ptr);
+    R_RegisterCFinalizerEx(ptr, chanFinalizer, TRUE);
+	    ...
+    /* return the channel no */
+    INTEGER(ans)[0] = nChannels;
+    /* and the connection string as an attribute */
+    setAttrib(ans, install("connection.string"), constr);
+    setAttrib(ans, install("handle_ptr"), ptr);
+    UNPROTECT(3);
+    return ans;
+
+ +

Note the symbol given to identify the usage of the external pointer, and the use of the finalizer. Since the final argument when registering the finalizer is TRUE, the finalizer will be run at the end of the R session (unless it crashes). This is used to close and clean up the connection to the database. The finalizer code is simply

+
+
static void chanFinalizer(SEXP ptr)
+{
+    if(!R_ExternalPtrAddr(ptr)) return;
+    inRODBCClose(R_ExternalPtrAddr(ptr));
+    R_ClearExternalPtr(ptr); /* not really needed */
+}
+
+ +

Clearing the pointer and checking for a NULL pointer avoids any +possibility of attempting to close an already-closed channel. +

+

R’s connections provide another example of using external pointers, in that case purely to be able to use a finalizer to close and destroy the connection if it is no longer in use.

+
+ + + +

5.14 Vector accessor functions

+ +

The vector accessors like REAL and INTEGER and +VECTOR_ELT are functions when used in R extensions. +(For efficiency they are macros when used in the R source code, apart +from SET_STRING_ELT and SET_VECTOR_ELT which are always +functions.) +

+

The accessor functions check that they are being used on an appropriate +type of SEXP. +

+

If efficiency is essential, the macro versions of the accessors can be +obtained by defining ‘USE_RINTERNALS’ before including +Rinternals.h. If you find it necessary to do so, please do test +that your code compiles without ‘USE_RINTERNALS’ defined, as this +provides a stricter test that the accessors have been used correctly. +Note too that the use of ‘USE_RINTERNALS’ when the header is +included in C++ code is not supported: doing so uses C99 features which +are not necessarily in C++. +

+
+ + + +

5.15 Character encoding issues

+ + + +

CHARSXPs can be marked as coming from a known encoding (Latin-1 +or UTF-8). This is mainly intended for human-readable output, and most +packages can just treat such CHARSXPs as a whole. However, if +they need to be interpreted as characters or output at C level then it +would normally be correct to ensure that they are converted to the +encoding of the current locale: this can be done by accessing the data +in the CHARSXP by translateChar rather than by +CHAR. If re-encoding is needed this allocates memory with +R_alloc which thus persists to the end of the +.Call/.External call unless vmaxset is used +(see Transient storage allocation). +

+

There is a similar function translateCharUTF8 which converts to +UTF-8: this has the advantage that a faithful translation is almost +always possible (whereas only a few languages can be represented in the +encoding of the current locale unless that is UTF-8). +

+ + +

There is a public interface to the encoding marked on CHARXSXPs +via +

+
+
typedef enum {CE_NATIVE, CE_UTF8, CE_LATIN1, CE_SYMBOL, CE_ANY} cetype_t;
+cetype_t getCharCE(SEXP);
+SEXP mkCharCE(const char *, cetype_t);
+
+ +

Only CE_UTF8 and CE_LATIN1 are marked on CHARSXPs +(and so Rf_getCharCE will only return one of the first three), +and these should only be used on non-ASCII strings. Value +CE_SYMBOL is used internally to indicate Adobe Symbol encoding. +Value CE_ANY is used to indicate a character string that will not +need re-encoding – this is used for character strings known to be in +ASCII, and can also be used as an input parameter where the +intention is that the string is treated as a series of bytes. (See the +comments under mkChar about the length of input allowed.) +

+

Function +

+ +
+
const char *reEnc(const char *x, cetype_t ce_in, cetype_t ce_out,
+		  int subst);
+
+ +

can be used to re-encode character strings: like translateChar it returns a string allocated by R_alloc. This can translate from CE_SYMBOL to CE_UTF8, but not conversely. Argument subst controls what to do with untranslatable characters or invalid input: this is done byte-by-byte, with 1 indicating output of hex of the form <a0>, 2 indicating replacement by '.', and any other value causing the byte to produce no output.

+ +

There is also +

+
+
SEXP mkCharLenCE(const char *, size_t, cetype_t);
+
+ +

to create marked character strings of a given length. +
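To illustrate these interfaces, here is a minimal sketch of a .Call entry point that returns a copy of a character vector with each element translated to, and marked as, UTF-8; the name to_utf8 is illustrative only.

#include <R.h>
#include <Rinternals.h>

SEXP to_utf8(SEXP x)
{
    if (TYPEOF(x) != STRSXP) error("'x' must be a character vector");
    R_xlen_t n = XLENGTH(x);
    SEXP ans = PROTECT(allocVector(STRSXP, n));
    for (R_xlen_t i = 0; i < n; i++) {
        /* translateCharUTF8 allocates with R_alloc; mkCharCE marks the copy */
        const char *s = translateCharUTF8(STRING_ELT(x, i));
        SET_STRING_ELT(ans, i, mkCharCE(s, CE_UTF8));
    }
    UNPROTECT(1);
    return ans;
}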

+ +
+ + + +

6 The R API: entry points for C code

+ + + + + + + + + + + + + + + + + + + + + +

There are a large number of entry points in the R executable/DLL that +can be called from C code (and some that can be called from FORTRAN +code). Only those documented here are stable enough that they will only +be changed with considerable notice. +

+

The recommended procedure to use these is to include the header file +R.h in your C code by +

+
+
#include <R.h>
+
+ +

This will include several other header files from the directory +R_INCLUDE_DIR/R_ext, and there are other header files +there that can be included too, but many of the features they contain +should be regarded as undocumented and unstable. +

+

An alternative is to include the header file S.h, which may be +useful when porting code from S. This includes rather less than +R.h, and has some extra compatibility definitions (for example +the S_complex type from S). +

+

The defines used for compatibility with S sometimes cause conflicts (notably with Windows headers), and the known problematic defines can be removed by defining STRICT_R_HEADERS.

+

Most of these header files, including all those included by R.h, +can be used from C++ code. Some others need to be included within an +extern "C" declaration, and for clarity this is advisable for all +R header files. +

+
+

Note: Because R re-maps many of its external names to avoid clashes with +user code, it is essential to include the appropriate header +files when using these entry points. +

+ +

This remapping can cause problems, and can be eliminated by defining R_NO_REMAP and prepending ‘Rf_’ to all the function names used from Rinternals.h and R_ext/Error.h. These problems can usually be avoided by including other headers (such as system headers and those for external software used by the package) before R.h.

+

We can classify the entry points as +

+
+
API
+

Entry points which are documented in this manual and declared in an +installed header file. These can be used in distributed packages and +will only be changed after deprecation. +

+
+
public
+

Entry points declared in an installed header file that are exported +on all R platforms but are not documented and subject to change +without notice. +

+
+
private
+

Entry points that are used when building R and exported on all R +platforms but are not declared in the installed header files. +Do not use these in distributed code. +

+
+
hidden
+

Entry points that are where possible (Windows and some modern Unix-alike +compilers/loaders when using R as a shared library) not exported. +

+
+ +
+ +
+


+
+ +

6.1 Memory allocation

+ + + + + + + +

There are two types of memory allocation available to the C programmer, one in which R manages the clean-up and the other in which the user has full control (and responsibility).

+
+ + + +

6.1.1 Transient storage allocation

+ + + + + + + +

Here R will reclaim the memory at the end of the call to .C, +.Call or .External. Use +

+
+
char *R_alloc(size_t n, int size)
+
+ +

which allocates n units of size bytes each. A typical usage +(from package stats) is +

+
+
x = (int *) R_alloc(nrows(merge)+2, sizeof(int));
+
+ +

(size_t is defined in stddef.h which the header defining +R_alloc includes.) +

+

There is a similar call, S_alloc (for compatibility with older +versions of S) which zeroes the memory allocated, +

+
+
char *S_alloc(long n, int size)
+
+ +

and +

+
+
char *S_realloc(char *p, long new, long old, int size)
+
+ +

which changes the allocation size from old to new units, and +zeroes the additional units. +

+

For compatibility with current versions of S, header S.h +(only) defines wrapper macros equivalent to +

+
+
type* Salloc(long n, int type)
+type* Srealloc(char *p, long new, long old, int type)
+
+ +

This memory is taken from the heap, and released at the end of the +.C, .Call or .External call. Users can also manage +it, by noting the current position with a call to vmaxget and +subsequently clearing memory allocated by a call to vmaxset. An +example might be +

+
+
void *vmax = vmaxget();
+// a loop involving the use of R_alloc at each iteration
+vmaxset(vmax);
+
+ +

This is only recommended for experts. +

+

Note that this memory will be freed on error or user interrupt +(if allowed: see Allowing interrupts). +

+

Note that although n is size_t, there may be limits imposed +by R’s internal allocation mechanism. These will only come into play +on 64-bit systems, where the limit for n prior to R 3.0.0 was +just under 16Gb. +

+

The memory returned is only guaranteed to be aligned as required for +double pointers: take precautions if casting to a pointer which +needs more. As from R 3.2.0 there is also +

+
+
long double *R_allocLD(size_t n)
+
+ +

which is guaranteed to have the 16-byte alignment needed for long +double pointers on some platforms. +

+ +

These functions should only be used in code called by .C etc, +never from front-ends. They are not thread-safe. +

+
+ + + +

6.1.2 User-controlled memory

+ + + + +

The other form of memory allocation is an interface to malloc, +the interface providing R error handling. This memory lasts until +freed by the user and is additional to the memory allocated for the R +workspace. +

+

The interface functions are +

+
+
type* Calloc(size_t n, type)
+type* Realloc(any *p, size_t n, type)
+void Free(any *p)
+
+ +

providing analogues of calloc, realloc and free. +If there is an error during allocation it is handled by R, so if +these routines return the memory has been successfully allocated or +freed. Free will set the pointer p to NULL. (Some +but not all versions of S do so.) +

+

Users should arrange to Free this memory when no longer needed, +including on error or user interrupt. This can often be done most +conveniently from an on.exit action in the calling R function +– see pwilcox for an example. +

+

Do not assume that memory allocated by Calloc/Realloc +comes from the same pool as used by malloc: in particular do not +use free or strdup with it. +

+

Memory obtained by these functions should be aligned in the same way as +malloc, that is ‘suitably aligned for any kind of variable’. +

+

These entry points need to be prefixed by R_ if +STRICT_R_HEADERS has been defined. +
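A minimal sketch of the pattern, for a routine called via .C; the function name cumsum_copy is illustrative only.

#include <R.h>

void cumsum_copy(double *x, int *n, double *out)
{
    /* scratch buffer: zero-initialised, allocation errors handled by R
       (use R_Calloc/R_Free if STRICT_R_HEADERS is defined) */
    double *buf = Calloc(*n, double);
    double s = 0.0;
    for (int i = 0; i < *n; i++) {
        buf[i] = s += x[i];
        out[i] = buf[i];
    }
    Free(buf);   /* also sets buf to NULL */
}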

+ +
+ +
+


+
+ +

6.2 Error handling

+ + +

The basic error handling routines are the equivalents of stop and +warning in R code, and use the same interface. +

+
+
void error(const char * format, ...);
+void warning(const char * format, ...);
+
+ +

These have the same call sequences as calls to printf, but in the +simplest case can be called with a single character string argument +giving the error message. (Don’t do this if the string contains ‘%’ +or might otherwise be interpreted as a format.) +
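For example, a minimal sketch of input checking in a .C routine; the function name check_positive is illustrative only.

#include <R.h>

void check_positive(double *x, int *n)
{
    for (int i = 0; i < *n; i++) {
        if (ISNAN(x[i]))
            warning("element %d is NA or NaN", i + 1);
        else if (x[i] <= 0)
            /* error() does not return */
            error("element %d (%g) is not positive", i + 1, x[i]);
    }
}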

+

If STRICT_R_HEADERS is not defined there is also an +S-compatibility interface which uses calls of the form +

+
+
PROBLEM ...... ERROR
+MESSAGE ...... WARN
+PROBLEM ...... RECOVER(NULL_ENTRY)
+MESSAGE ...... WARNING(NULL_ENTRY)
+
+ +

the last two being the forms available in all S versions. Here +‘......’ is a set of arguments to printf, so can be a string +or a format string followed by arguments separated by commas. +

+ + + + +
+ +
+


+
+ +

6.2.1 Error handling from FORTRAN

+ + +

There are two interface functions provided to call error and warning from FORTRAN code, in each case with a simple character string argument. They are defined as

+
+
subroutine rexit(message)
+subroutine rwarn(message)
+
+ +

Messages of more than 255 characters are truncated, with a warning. +

+ +
+ + + +

6.3 Random number generation

+ + + + + + + + + + +

The interface to R’s internal random number generation routines is +

+
+
double unif_rand();
+double norm_rand();
+double exp_rand();
+
+ +

giving one uniform, normal or exponential pseudo-random variate. +However, before these are used, the user must call +

+
+
GetRNGstate();
+
+ +

and after all the required variates have been generated, call +

+
+
PutRNGstate();
+
+ +

These essentially read in (or create) .Random.seed and write it +out after use. +
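A minimal sketch of the protocol, filling a vector with uniform variates from a routine called via .C (the name runif_fill is illustrative):

#include <R.h>

void runif_fill(double *x, int *n)
{
    GetRNGstate();                 /* read in .Random.seed */
    for (int i = 0; i < *n; i++)
        x[i] = unif_rand();
    PutRNGstate();                 /* write the state back out */
}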

+

File S.h defines seed_in and seed_out for +S-compatibility rather than GetRNGstate and +PutRNGstate. These take a long * argument which is +ignored. +

+

The random number generator is private to R; there is no way to +select the kind of RNG or set the seed except by evaluating calls to the +R functions. +

+

The C code behind R’s rxxx functions can be accessed by +including the header file Rmath.h; See Distribution functions. Those calls generate a single variate and should also be +enclosed in calls to GetRNGstate and PutRNGstate. +

+ +
+ +
+


+
+ +

6.4 Missing and IEEE special values

+ + + + + + + + + + +

A set of functions is provided to test for NA, Inf, +-Inf and NaN. These functions are accessed via macros: +

+
+
ISNA(x)        True for R’s NA only
+ISNAN(x)       True for R’s NA and IEEE NaN
+R_FINITE(x)    False for Inf, -Inf, NA, NaN
+
+ +

and via function R_IsNaN which is true for NaN but not +NA. +

+

Do use R_FINITE rather than isfinite or finite; the +latter is often mendacious and isfinite is only available on a +some platforms, on which R_FINITE is a macro expanding to +isfinite. +

+

Currently in C code ISNAN is a macro calling isnan. (Since this gives problems on some C++ systems, a function call is used if the R headers are included from C++ code.)

+

You can check for Inf or -Inf by testing equality to +R_PosInf or R_NegInf, and set (but not test) an NA +as NA_REAL. +

+

All of the above apply to double variables only. For integer variables there is a variable accessed by the macro NA_INTEGER which can be used to set or test for missingness.
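A minimal sketch of their use in a .C routine which skips NAs and refuses non-finite input; the function name sum_finite is illustrative only.

#include <R.h>

void sum_finite(double *x, int *n, double *ans)
{
    double s = 0.0;
    *ans = NA_REAL;                     /* set (not test) an NA */
    for (int i = 0; i < *n; i++) {
        if (ISNA(x[i])) continue;       /* R's NA only */
        if (!R_FINITE(x[i])) return;    /* Inf, -Inf or NaN: give up */
        s += x[i];
    }
    *ans = s;
}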

+ +
+ + + +

6.5 Printing

+ + + + + + +

The most useful function for printing from a C routine compiled into +R is Rprintf. This is used in exactly the same way as +printf, but is guaranteed to write to R’s output (which might +be a GUI console rather than a file, and can be re-directed by +sink). It is wise to write complete lines (including the +"\n") before returning to R. It is defined in +R_ext/Print.h. +

+

The function REprintf is similar but writes on the error stream +(stderr) which may or may not be different from the standard +output stream. +

+

Functions Rvprintf and REvprintf are analogues using the +vprintf interface. Because that is a C99 interface, they are +only defined by R_ext/Print.h in C++ code if the macro +R_USE_C99_IN_CXX is defined when it is included. +

+

Another circumstance when it may be important to use these functions is +when using parallel computation on a cluster of computational nodes, as +their output will be re-directed/logged appropriately. +

+ + + + +
+ +
+


+
+ +

6.5.1 Printing from FORTRAN

+ + +

On many systems FORTRAN write and print statements can be +used, but the output may not interleave well with that of C, and will be +invisible on GUI interfaces. They are not portable and best +avoided. +

+

Three subroutines are provided to ease the output of information from +FORTRAN code. +

+
+
subroutine dblepr(label, nchar, data, ndata)
+subroutine realpr(label, nchar, data, ndata)
+subroutine intpr (label, nchar, data, ndata)
+
+ +

Here label is a character label of up to 255 characters, +nchar is its length (which can be -1 if the whole label is +to be used), and data is an array of length at least ndata +of the appropriate type (double precision, real and +integer respectively). These routines print the label on one +line and then print data as if it were an R vector on +subsequent line(s). They work with zero ndata, and so can be used +to print a label alone. +

+
+ + + +

6.6 Calling C from FORTRAN and vice versa

+ + +

Naming conventions for symbols generated by FORTRAN differ by platform: +it is not safe to assume that FORTRAN names appear to C with a trailing +underscore. To help cover up the platform-specific differences there is +a set of macros that should be used. +

+
+
F77_SUB(name)
+

to define a function in C to be called from FORTRAN +

+
F77_NAME(name)
+

to declare a FORTRAN routine in C before use +

+
F77_CALL(name)
+

to call a FORTRAN routine from C +

+
F77_COMDECL(name)
+

to declare a FORTRAN common block in C +

+
F77_COM(name)
+

to access a FORTRAN common block from C +

+
+ +

On most current platforms these are all the same, but it is unwise to +rely on this. Note that names with underscores are not legal in FORTRAN +77, and are not portably handled by the above macros. (Also, all +FORTRAN names for use by R are lower case, but this is not enforced +by the macros.) +

+

For example, suppose we want to call R’s normal random numbers from +FORTRAN. We need a C wrapper along the lines of +

+ +
+
#include <R.h>
+
+void F77_SUB(rndstart)(void) { GetRNGstate(); }
+void F77_SUB(rndend)(void) { PutRNGstate(); }
+double F77_SUB(normrnd)(void) { return norm_rand(); }
+
+ +

to be called from FORTRAN as in +

+
+
      subroutine testit()
+      double precision normrnd, x
+      call rndstart()
+      x = normrnd()
+      call dblepr("X was", 5, x, 1)
+      call rndend()
+      end
+
+ +

Note that this is not guaranteed to be portable, for the return +conventions might not be compatible between the C and FORTRAN compilers +used. (Passing values via arguments is safer.) +

+

The standard packages, for example stats, are a rich source of +further examples. +

+

Passing character strings from C to FORTRAN 77 or vice versa is +not portable (and to Fortran 90 or later is even less so). We have +found that it helps to ensure that a C string to be passed is followed +by several nuls (and not just the one needed as a C terminator). +But for maximal portability character strings in FORTRAN should be +avoided. +

+ +
+ + + +

6.7 Numerical analysis subroutines

+ + +

R contains a large number of mathematical functions for its own use, +for example numerical linear algebra computations and special functions. +

+

The header files R_ext/BLAS.h, R_ext/Lapack.h and R_ext/Linpack.h contain declarations of the BLAS, LAPACK and LINPACK linear algebra functions included in R. These are expressed as calls to FORTRAN subroutines, and they will also be usable from users’ FORTRAN code. Although not part of the official API, this set of subroutines is unlikely to change (but might be supplemented).

+

The header file Rmath.h lists many other functions that are +available and documented in the following subsections. Many of these are +C interfaces to the code behind R functions, so the R function +documentation may give further details. +

+ + + + + + + +
+ + + +

6.7.1 Distribution functions

+ + +

The routines used to calculate densities, cumulative distribution +functions and quantile functions for the standard statistical +distributions are available as entry points. +

+

The arguments for the entry points follow the pattern of those for the +normal distribution: +

+
+
double dnorm(double x, double mu, double sigma, int give_log);
+double pnorm(double x, double mu, double sigma, int lower_tail,
+	     int give_log);
+double qnorm(double p, double mu, double sigma, int lower_tail,
+	     int log_p);
+double rnorm(double mu, double sigma);
+
+ +

That is, the first argument gives the position for the density and CDF +and probability for the quantile function, followed by the +distribution’s parameters. Argument lower_tail should be +TRUE (or 1) for normal use, but can be FALSE (or +0) if the probability of the upper tail is desired or specified. +

+

Finally, give_log should be non-zero if the result is required on +log scale, and log_p should be non-zero if p has been +specified on log scale. +

+

Note that you directly get the cumulative (or “integrated”) +hazard function, H(t) = - log(1 - +F(t)), by using +

+
+
- pdist(t, ..., /*lower_tail = */ FALSE, /* give_log = */ TRUE)
+
+ +

or shorter (and more cryptic) - pdist(t, ..., 0, 1). + +

+

The random-variate generation routine rnorm returns one normal +variate. See Random numbers, for the protocol in using the +random-variate routines. + +

+

Note that these argument sequences are (apart from the names and that +rnorm has no n) mainly the same as the corresponding R +functions of the same name, so the documentation of the R functions +can be used. Note that the exponential and gamma distributions are +parametrized by scale rather than rate. +
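As a minimal illustration (the function names here are invented for the example), a normal log-likelihood computed with give_log = 1, and a log upper-tail probability from pnorm with lower_tail = 0 and log_p = 1:

#include <R.h>
#include <Rmath.h>

void norm_loglik(double *x, int *n, double *mu, double *sigma, double *ans)
{
    double ll = 0.0;
    for (int i = 0; i < *n; i++)
        ll += dnorm(x[i], *mu, *sigma, 1);   /* log density */
    *ans = ll;
}

double upper_tail_log(double t, double mu, double sigma)
{
    return pnorm(t, mu, sigma, 0, 1);        /* log P(X > t) */
}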

+ +

For reference, the following table gives the basic name (to be prefixed +by ‘d’, ‘p’, ‘q’ or ‘r’ apart from the exceptions +noted) and distribution-specific arguments for the complete set of +distributions. +

+
+ + + + + + + + + + + + + + + + + + + + + + + + + +
beta                      beta       a, b
non-central beta          nbeta      a, b, ncp
binomial                  binom      n, p
Cauchy                    cauchy     location, scale
chi-squared               chisq      df
non-central chi-squared   nchisq     df, ncp
exponential               exp        scale (and not rate)
F                         f          n1, n2
non-central F             nf         n1, n2, ncp
gamma                     gamma      shape, scale
geometric                 geom       p
hypergeometric            hyper      NR, NB, n
logistic                  logis      location, scale
lognormal                 lnorm      logmean, logsd
negative binomial         nbinom     size, prob
normal                    norm       mu, sigma
Poisson                   pois       lambda
Student's t               t          n
non-central t             nt         df, delta
Studentized range         tukey (*)  rr, cc, df
uniform                   unif       a, b
Weibull                   weibull    shape, scale
Wilcoxon rank sum         wilcox     m, n
Wilcoxon signed rank      signrank   n
+
+ +

Entries marked with an asterisk only have ‘p’ and ‘q’ +functions available, and none of the non-central distributions have +‘r’ functions. After a call to dwilcox, pwilcox or +qwilcox the function wilcox_free() should be called, and +similarly for the signed rank functions. +

+

(If remapping is suppressed, the Normal distribution names are +Rf_dnorm4, Rf_pnorm5 and Rf_qnorm5.) +

+

For the negative binomial distribution (‘nbinom’), in addition to the +(size, prob) parametrization, the alternative (size, mu) +parametrization is provided as well by functions ‘[dpqr]nbinom_mu()’, +see ?NegBinomial in R. +

+

Functions dpois_raw(x, *) and dbinom_raw(x, *) are versions of the +Poisson and binomial probability mass functions which work continuously in +x, whereas dbinom(x,*) and dpois(x,*) only return non +zero values for integer x. +

+
double dbinom_raw(double x, double n, double p, double q, int give_log)
+double dpois_raw (double x, double lambda, int give_log)
+
+

Note that dbinom_raw() gets both p and q = 1-p which +may be advantageous when one of them is close to 1. +

+ +
+ + + +

6.7.2 Mathematical functions

+ + + + + + + + + +
+
Function: double gammafn (double x)
+
Function: double lgammafn (double x)
+
Function: double digamma (double x)
+
Function: double trigamma (double x)
+
Function: double tetragamma (double x)
+
Function: double pentagamma (double x)
+
Function: double psigamma (double x, double deriv)
+

The Gamma function, the natural logarithm of its absolute value and first four derivatives and the n-th derivative of Psi, the digamma function, which is the derivative of lgammafn. In other words, digamma(x) is the same as psigamma(x,0), trigamma(x) == psigamma(x,1), etc.

+ + + + +
+
Function: double beta (double a, double b)
+
Function: double lbeta (double a, double b)
+

The (complete) Beta function and its natural logarithm. +

+ + + +
+
Function: double choose (double n, double k)
+
Function: double lchoose (double n, double k)
+

The number of combinations of k items chosen from n and the natural logarithm of its absolute value, generalized to arbitrary real n. k is rounded to the nearest integer (with a warning if needed).

+ + + + + + +
+
Function: double bessel_i (double x, double nu, double expo)
+
Function: double bessel_j (double x, double nu)
+
Function: double bessel_k (double x, double nu, double expo)
+
Function: double bessel_y (double x, double nu)
+

Bessel functions of types I, J, K and Y with index nu. For bessel_i and bessel_k there is the option to return exp(-x) I(x; nu) or exp(x) K(x; nu) if expo is 2. (Use expo == 1 for unscaled values.)

+ + +
+ + + +

6.7.3 Numerical Utilities

+

There are a few other numerical utility functions available as entry points. +

+ +
+
Function: double R_pow (double x, double y)
+
Function: double R_pow_di (double x, int i)
+

R_pow(x, y) and R_pow_di(x, i) +compute x^y and x^i, respectively +using R_FINITE checks and returning the proper result (the same +as R) for the cases where x, y or i are 0 or +missing or infinite or NaN. +

+ +
+
Function: double log1p (double x)
+

Computes log(1 + x) (log 1 plus x), accurately +even for small x, i.e., |x| << 1. +

+

This should be provided by your platform, in which case it is not included +in Rmath.h, but is (probably) in math.h which +Rmath.h includes. +

+ +
+
Function: double log1pmx (double x)
+

Computes log(1 + x) - x (log 1 plus x minus x), +accurately even for small x, i.e., |x| << 1. +

+ +
+
Function: double log1pexp (double x)
+

Computes log(1 + exp(x)) (log 1 plus exp), +accurately, notably for large x, e.g., x > 720. +

+ + +
+
Function: double expm1 (double x)
+

Computes exp(x) - 1 (exp x minus 1), accurately +even for small x, i.e., |x| << 1. +

+

This should be provided by your platform, in which case it is not included +in Rmath.h, but is (probably) in math.h which +Rmath.h includes. +

+ +
+
Function: double lgamma1p (double x)
+

Computes log(gamma(x + 1)) (log(gamma(1 plus x))), +accurately even for small x, i.e., 0 < x < 0.5. +

+ +
+
Function: double cospi (double x)
+

Computes cos(pi * x) (where pi is 3.14159...), +accurately, notably for half integer x. +

+

This might be provided by your platform, in which case it is not included in Rmath.h, but is in math.h which Rmath.h includes.

+ +
+
Function: double sinpi (double x)
+

Computes sin(pi * x) accurately, notably for (half) integer x. +

+

This might be provided by your platform, in which case it is not included +in Rmath.h, but is in math.h which Rmath.h includes. +

+ +
+
Function: double tanpi (double x)
+

Computes tan(pi * x) accurately, notably for (half) integer x. +

+

This might be provided by your platform, in which case it is not included +in Rmath.h, but is in math.h which Rmath.h includes. +

+ +
+
Function: double logspace_add (double logx, double logy)
+
Function: double logspace_sub (double logx, double logy)
+
Function: double logspace_sum (double* logx, int n)
+

Compute the log of a sum or difference from logs of terms, i.e., “x + +y” as log (exp(logx) + exp(logy)) and “x - y” as +log (exp(logx) - exp(logy)), +and “sum_i x[i]” as log (sum[i = 1:n exp(logx[i])] ) +without causing unnecessary overflows or throwing away too much accuracy. +
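As an illustration, a minimal sketch of a numerically stable log-sum-exp accumulated with logspace_add (the same result could be obtained in one call to logspace_sum); the function name is invented for the example.

#include <R.h>
#include <Rmath.h>

double log_sum_exp(const double *logx, int n)
{
    double s = R_NegInf;                 /* log(0) */
    for (int i = 0; i < n; i++)
        s = logspace_add(s, logx[i]);    /* log(exp(s) + exp(logx[i])) */
    return s;
}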

+ +
+
Function: int imax2 (int x, int y)
+
Function: int imin2 (int x, int y)
+
Function: double fmax2 (double x, double y)
+
Function: double fmin2 (double x, double y)
+

Return the larger (max) or smaller (min) of two integer or +double numbers, respectively. Note that fmax2 and fmin2 +differ from C99’s fmax and fmin when one of the arguments +is a NaN: these versions return NaN. +

+ +
+
Function: double sign (double x)
+

Compute the signum function, where sign(x) is 1, 0, or +-1, when x is positive, 0, or negative, respectively, and +NaN if x is a NaN. +

+ +
+
Function: double fsign (double x, double y)
+

Performs “transfer of sign” and is defined as |x| * sign(y). +

+ +
+
Function: double fprec (double x, double digits)
+

Returns the value of x rounded to digits decimal digits +(after the decimal point). +

+

This is the function used by R’s signif(). +

+ +
+
Function: double fround (double x, double digits)
+

Returns the value of x rounded to digits significant +decimal digits. +

+

This is the function used by R’s round(). +

+ +
+
Function: double ftrunc (double x)
+

Returns the value of x truncated (to an integer value) towards +zero. +

+

Note that this is no longer needed in C code, as C99 provides a trunc function. It is needed for portable C++98 code.

+ +
+ + + +

6.7.4 Mathematical constants

+ + + +

R has a set of commonly used mathematical constants encompassing constants usually found in math.h and contains further ones that are used in statistical computations. All these are defined to (at least) 30 digits accuracy in Rmath.h. The following definitions use ln(x) for the natural logarithm (log(x) in R).

+
+ + + + + + + + + + + + + + + + + + + + + + + + + +
Name             Definition (ln = log)   round(value, 7)
M_E              e                       2.7182818
M_LOG2E          log2(e)                 1.4426950
M_LOG10E         log10(e)                0.4342945
M_LN2            ln(2)                   0.6931472
M_LN10           ln(10)                  2.3025851
M_PI             pi                      3.1415927
M_PI_2           pi/2                    1.5707963
M_PI_4           pi/4                    0.7853982
M_1_PI           1/pi                    0.3183099
M_2_PI           2/pi                    0.6366198
M_2_SQRTPI       2/sqrt(pi)              1.1283792
M_SQRT2          sqrt(2)                 1.4142136
M_SQRT1_2        1/sqrt(2)               0.7071068
M_SQRT_3         sqrt(3)                 1.7320508
M_SQRT_32        sqrt(32)                5.6568542
M_LOG10_2        log10(2)                0.3010300
M_2PI            2*pi                    6.2831853
M_SQRT_PI        sqrt(pi)                1.7724539
M_1_SQRT_2PI     1/sqrt(2*pi)            0.3989423
M_SQRT_2dPI      sqrt(2/pi)              0.7978846
M_LN_SQRT_PI     ln(sqrt(pi))            0.5723649
M_LN_SQRT_2PI    ln(sqrt(2*pi))          0.9189385
M_LN_SQRT_PId2   ln(sqrt(pi/2))          0.2257914
+
+ +

There is a set of constants (PI, DOUBLE_EPS, and so on) defined (unless STRICT_R_HEADERS is defined) in the included header R_ext/Constants.h, mainly for compatibility with S.

+ + +

Further, the included header R_ext/Boolean.h has enumeration +constants TRUE and FALSE of type Rboolean in +order to provide a way of using “logical” variables in C consistently. +This can conflict with other software: for example it conflicts with the +headers in IJG’s jpeg-9 (but not earlier versions). +

+ +
+ + + +

6.8 Optimization

+ + +

The C code underlying optim can be accessed directly. The user +needs to supply a function to compute the function to be minimized, of +the type +

+
+
typedef double optimfn(int n, double *par, void *ex);
+
+ +

where the first argument is the number of parameters in the second +argument. The third argument is a pointer passed down from the calling +routine, normally used to carry auxiliary information. +

+

Some of the methods also require a gradient function +

+
+
typedef void optimgr(int n, double *par, double *gr, void *ex);
+
+ +

which passes back the gradient in the gr argument. No function +is provided for finite-differencing, nor for approximating the Hessian +at the result. +

+

The interfaces (defined in header R_ext/Applic.h) are +

+
    +
  • Nelder Mead: + +
    +
    void nmmin(int n, double *xin, double *x, double *Fmin, optimfn fn,
    +	   int *fail, double abstol, double intol, void *ex,
    +	   double alpha, double beta, double gamma, int trace,
    +	   int *fncount, int maxit);
    +
    + +
  • BFGS: + +
    +
    void vmmin(int n, double *x, double *Fmin,
    +	   optimfn fn, optimgr gr, int maxit, int trace,
    +	   int *mask, double abstol, double reltol, int nREPORT,
    +	   void *ex, int *fncount, int *grcount, int *fail);
    +
    + +
  • Conjugate gradients: + +
    +
    void cgmin(int n, double *xin, double *x, double *Fmin,
    +	   optimfn fn, optimgr gr, int *fail, double abstol,
    +	   double intol, void *ex, int type, int trace,
    +	   int *fncount, int *grcount, int maxit);
    +
    + +
  • Limited-memory BFGS with bounds: + +
    +
    void lbfgsb(int n, int lmm, double *x, double *lower,
    +	    double *upper, int *nbd, double *Fmin, optimfn fn,
    +	    optimgr gr, int *fail, void *ex, double factr,
    +	    double pgtol, int *fncount, int *grcount,
    +	    int maxit, char *msg, int trace, int nREPORT);
    +
    + +
  • Simulated annealing: + +
    +
    void samin(int n, double *x, double *Fmin, optimfn fn, int maxit,
    +	   int tmax, double temp, int trace, void *ex);
    +
    + +
+ +

Many of the arguments are common to the various methods. n is +the number of parameters, x or xin is the starting +parameters on entry and x the final parameters on exit, with +final value returned in Fmin. Most of the other parameters can +be found from the help page for optim: see the source code +src/appl/lbfgsb.c for the values of nbd, which +specifies which bounds are to be used. +
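A minimal sketch of driving the Nelder-Mead code directly follows; the objective function and the control values (which roughly mirror optim's defaults) are illustrative only.

#include <R.h>
#include <R_ext/Applic.h>

/* objective of type optimfn: (x - 3)^2 + (y + 1)^2; 'ex' is unused here */
static double quad(int n, double *par, void *ex)
{
    return (par[0] - 3.0) * (par[0] - 3.0) + (par[1] + 1.0) * (par[1] + 1.0);
}

void run_nm(double *xin, double *xout, double *fmin)
{
    int fail = 0, fncount = 0;
    nmmin(2, xin, xout, fmin, quad, &fail,
          R_NegInf,      /* abstol */
          1e-8,          /* intol (relative tolerance) */
          NULL,          /* ex: no auxiliary data */
          1.0, 0.5, 2.0, /* alpha, beta, gamma */
          0,             /* trace */
          &fncount, 500  /* maxit */);
    if (fail) warning("Nelder-Mead failed to converge in 500 iterations");
}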

+ +
+ +
+


+
+ +

6.9 Integration

+ + +

The C code underlying integrate can be accessed directly. The +user needs to supply a vectorizing C function to compute the +function to be integrated, of the type +

+
+
typedef void integr_fn(double *x, int n, void *ex);
+
+ +

where x[] is both input and output and has length n, i.e., +a C function, say fn, of type integr_fn must basically do +for(i in 1:n) x[i] := f(x[i], ex). The vectorization requirement +can be used to speed up the integrand instead of calling it n +times. Note that in the current implementation built on QUADPACK, +n will be either 15 or 21. The ex argument is a pointer +passed down from the calling routine, normally used to carry auxiliary +information. +

+

There are interfaces (defined in header R_ext/Applic.h) for +integrals over finite and infinite intervals (or “ranges” or +“integration boundaries”). +

+
    +
  • Finite: + +
    +
    void Rdqags(integr_fn f, void *ex, double *a, double *b,
    +	    double *epsabs, double *epsrel,
    +	    double *result, double *abserr, int *neval, int *ier,
    +	    int *limit, int *lenw, int *last,
    +	    int *iwork, double *work);
    +
    + +
  • Infinite: + +
    +
    void Rdqagi(integr_fn f, void *ex, double *bound, int *inf,
    +	    double *epsabs, double *epsrel,
    +	    double *result, double *abserr, int *neval, int *ier,
    +	    int *limit, int *lenw, int *last,
    +	    int *iwork, double *work);
    +
    + +
+ +

Only the 3rd and 4th argument differ for the two integrators; for the +finite range integral using Rdqags, a and b are the +integration interval bounds, whereas for an infinite range integral using +Rdqagi, bound is the finite bound of the integration (if +the integral is not doubly-infinite) and inf is a code indicating +the kind of integration range, +

+
+
inf = 1
+

corresponds to (bound, +Inf), +

+
inf = -1
+

corresponds to (-Inf, bound), +

+
inf = 2
+

corresponds to (-Inf, +Inf), +

+
+ +

f and ex define the integrand function, see above; +epsabs and epsrel specify the absolute and relative +accuracy requested, result, abserr and last are the +output components value, abs.err and subdivisions +of the R function integrate, where neval gives the number of +integrand function evaluations, and the error code ier is +translated to R’s integrate() $ message, look at that function +definition. limit corresponds to integrate(..., +subdivisions = *). It seems you should always define the two work +arrays and the length of the second one as +

+
+
    lenw = 4 * limit;
+    iwork =   (int *) R_alloc(limit, sizeof(int));
+    work = (double *) R_alloc(lenw,  sizeof(double));
+
+ +

The comments in the source code in src/appl/integrate.c give +more details, particularly about reasons for failure (ier >= 1). +
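A minimal sketch of a finite-range integral of exp(-x^2) over [a, b] using Rdqags, for use from code called via .C or .Call; the function names and tolerances are illustrative only.

#include <math.h>
#include <R.h>
#include <R_ext/Applic.h>

/* vectorising integrand of type integr_fn: x[i] := exp(-x[i]^2) */
static void gauss_fn(double *x, int n, void *ex)
{
    for (int i = 0; i < n; i++)
        x[i] = exp(-x[i] * x[i]);
}

double integrate_gauss(double a, double b)
{
    double epsabs = 1e-10, epsrel = 1e-10, result, abserr;
    int neval, ier, limit = 100, last;
    int lenw = 4 * limit;
    int *iwork = (int *) R_alloc(limit, sizeof(int));
    double *work = (double *) R_alloc(lenw, sizeof(double));

    Rdqags(gauss_fn, NULL, &a, &b, &epsabs, &epsrel,
           &result, &abserr, &neval, &ier,
           &limit, &lenw, &last, iwork, work);
    if (ier >= 1) warning("integration failed, error code %d", ier);
    return result;
}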

+ +
+ +
+


+
+ +

6.10 Utility functions

+ + +

R has a fairly comprehensive set of sort routines which are made +available to users’ C code. +The following is declared in header file Rinternals.h. +

+
+
Function: void R_orderVector (int* indx, int n, SEXP arglist, Rboolean nalast, Rboolean decreasing)
+
+

This corresponds to R’s order(..., na.last, decreasing). +More specifically, indx <- order(x, y, na.last, decreasing) corresponds to +R_orderVector(indx, n, Rf_lang2(x, y), nalast, decreasing) and for +three vectors, Rf_lang3(x,y,z) is used as arglist. +

+

Note that R_orderVector() assumes the vector indx +to be allocated to length >= n. On return, indx[] +contains a permutation of 0:(n-1), i.e., 0-based C indices (and not +1-based R indices, as R’s order()). +

+ +

All other sort routines are declared in header file +R_ext/Utils.h (included by R.h) and include the following. +

+
+
Function: void R_isort (int* x, int n)
+
Function: void R_rsort (double* x, int n)
+
Function: void R_csort (Rcomplex* x, int n)
+
Function: void rsort_with_index (double* x, int* index, int n)
+

The first three sort integer, real (double) and complex data +respectively. (Complex numbers are sorted by the real part first then +the imaginary part.) NAs are sorted last. +

+

rsort_with_index sorts on x, and applies the same +permutation to index. NAs are sorted last. +

+ +
+
Function: void revsort (double* x, int* index, int n)
+

Is similar to rsort_with_index but sorts into decreasing order, +and NAs are not handled. +

+ +
+
Function: void iPsort (int* x, int n, int k)
+
Function: void rPsort (double* x, int n, int k)
+
Function: void cPsort (Rcomplex* x, int n, int k)
+

These all provide (very) partial sorting: they permute x so that +x[k] is in the correct place with smaller values to +the left, larger ones to the right. +

+ + +
+
Function: void R_qsort (double *v, size_t i, size_t j)
+
Function: void R_qsort_I (double *v, int *I, int i, int j)
+
Function: void R_qsort_int (int *iv, size_t i, size_t j)
+
Function: void R_qsort_int_I (int *iv, int *I, int i, int j)
+
+ +

These routines sort v[i:j] or +iv[i:j] (using 1-indexing, i.e., +v[1] is the first element) calling the quicksort algorithm +as used by R’s sort(v, method = "quick") and documented on the +help page for the R function sort. The ..._I() +versions also return the sort.index() vector in I. Note +that the ordering is not stable, so tied values may be permuted. +

+

Note that NAs are not handled (explicitly) and you should +use different sorting functions if NAs can be present. +

+ +
+
Function: subroutine qsort4 (double precision v, integer indx, integer ii, integer jj)
+
Function: subroutine qsort3 (double precision v, integer ii, integer jj)
+
+

The FORTRAN interface routines for sorting double precision vectors are +qsort3 and qsort4, equivalent to R_qsort and +R_qsort_I, respectively. +

+ +
+
Function: void R_max_col (double* matrix, int* nr, int* nc, int* maxes, int* ties_meth)
+

Given the nr by nc matrix matrix in column-major +(“FORTRAN”) +order, R_max_col() returns in maxes[i-1] the +column number of the maximal element in the i-th row (the same as +R’s max.col() function). In the case of ties (multiple maxima), +*ties_meth is an integer code in 1:3 determining the method: +1 = “random”, 2 = “first” and 3 = “last”. +See R’s help page ?max.col. +

+ +
+
Function: int findInterval (double* xt, int n, double x, Rboolean rightmost_closed, Rboolean all_inside, int ilo, int* mflag)
+

Given the ordered vector xt of length n, return the interval +or index of x in xt[], typically max(i; 1 <= i <= n & xt[i] <= +x) where we use 1-indexing as in R and FORTRAN (but not C). If +rightmost_closed is true, also returns n-1 if x +equals xt[n]. If all_inside is not 0, the +result is coerced to lie in 1:(n-1) even when x is +outside the xt[] range. On return, *mflag equals +-1 if x < xt[1], +1 if x >= +xt[n], and 0 otherwise. +

+

The algorithm is particularly fast when ilo is set to the last +result of findInterval() and x is a value of a sequence which +is increasing or decreasing for subsequent calls. +

+

There is also an F77_CALL(interv)() version of +findInterval() with the same arguments, but all pointers. +
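A minimal sketch of its use from a .C routine, reusing the previous answer as the ilo hint when the query values are themselves sorted; the function name bin_values is illustrative only.

#include <R.h>
#include <R_ext/Utils.h>

void bin_values(double *xt, int *n, double *x, int *nx, int *bin)
{
    int mflag, ilo = 1;
    for (int i = 0; i < *nx; i++) {
        ilo = findInterval(xt, *n, x[i], FALSE, FALSE, ilo, &mflag);
        bin[i] = ilo;   /* 1-based interval index; 0 if x[i] < xt[1] */
    }
}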

+ +

A system-independent interface to produce the name of a temporary +file is provided as +

+
+
Function: char * R_tmpnam (const char *prefix, const char *tmpdir)
+
Function: char * R_tmpnam2 (const char *prefix, const char *tmpdir, const char *fileext)
+

Return a pathname for a temporary file with name beginning with +prefix and ending with fileext in directory tmpdir. +A NULL prefix or extension is replaced by "". Note that +the return value is malloced and should be freed when no +longer needed (unlike the system call tmpnam). +

+ + +

There is also the internal function used to expand file names in several +R functions, and called directly by path.expand. +

+
+
Function: const char * R_ExpandFileName (const char *fn)
+

Expand a path name fn by replacing a leading tilde by the user’s +home directory (if defined). The precise meaning is platform-specific; +it will usually be taken from the environment variable HOME if +this is defined. +

+ +

For historical reasons there are FORTRAN interfaces to functions +D1MACH and I1MACH. These can be called from C code as +e.g. F77_CALL(d1mach)(4). Note that these are emulations of +the original functions by Fox, Hall and Schryer on NetLib at +http://www.netlib.org/slatec/src/ for IEC 60559 arithmetic +(required by R). +

+
+ + + +

6.11 Re-encoding

+ +

R has its own C-level interface to the encoding conversion +capabilities provided by iconv because there are +incompatibilities between the declarations in different implementations +of iconv. +

+

These are declared in header file R_ext/Riconv.h. +

+
+
Function: void * Riconv_open (const char *to, const char *from)
+
+

Set up a pointer to an encoding object to be used to convert between two +encodings: "" indicates the current locale. +

+
+
Function: size_t Riconv (void *cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
+
+

Convert as much as possible of inbuf to outbuf. Initially +the int variables indicate the number of bytes available in the +buffers, and they are updated (and the char pointers are updated +to point to the next free byte in the buffer). The return value is the +number of characters converted, or (size_t)-1 (beware: +size_t is usually an unsigned type). It should be safe to assume +that an error condition sets errno to one of E2BIG (the +output buffer is full), EILSEQ (the input cannot be converted, +and might be invalid in the encoding specified) or EINVAL (the +input does not end with a complete multi-byte character). +

+
+
Function: int Riconv_close (void * cd)
+
+

Free the resources of an encoding object. +
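A minimal sketch of converting a latin1 string to UTF-8 into a caller-supplied buffer; the function name is invented for the example, and the (void *)-1 failure value of Riconv_open is an assumption modelled on iconv_open.

#include <string.h>
#include <R.h>
#include <R_ext/Riconv.h>

int latin1_to_utf8(const char *in, char *out, size_t outlen)
{
    void *cd = Riconv_open("UTF-8", "latin1");
    if (cd == (void *)-1) return -1;      /* conversion unsupported (assumed) */

    const char *inbuf = in;
    char *outbuf = out;
    size_t inleft = strlen(in), outleft = outlen;
    size_t res = Riconv(cd, &inbuf, &inleft, &outbuf, &outleft);
    Riconv_close(cd);
    if (res == (size_t)-1) return -1;     /* consult errno: E2BIG, EILSEQ, EINVAL */
    return (int)(outlen - outleft);       /* bytes written to 'out' */
}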

+ +
+ + + +

6.12 Allowing interrupts

+ + +

No port of R can be interrupted whilst running long computations in +compiled code, so programmers should make provision for the code to be +interrupted at suitable points by calling from C +

+
+
#include <R_ext/Utils.h>
+
+void R_CheckUserInterrupt(void);
+
+ +

and from FORTRAN +

+
+
subroutine rchkusr()
+
+ +

These check if the user has requested an interrupt, and if so branch to +R’s error handling functions. +
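A minimal sketch, checking periodically rather than on every iteration since the check itself has some cost; the function name long_running is illustrative only.

#include <R.h>
#include <R_ext/Utils.h>

void long_running(double *x, int *n, double *ans)
{
    double s = 0.0;
    for (int i = 0; i < *n; i++) {
        if (i % 10000 == 0)
            R_CheckUserInterrupt();   /* may branch to R's error handling */
        s += x[i] * x[i];             /* stand-in for real work */
    }
    *ans = s;
}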

+

Note that if the code behind one of the entry points defined here is called from your C or FORTRAN code, it could be interruptible or generate an error and so not return to your code.

+ +
+ + + +

6.13 Platform and version information

+ + + + +

The header files define USING_R, which can be used to test if +the code is indeed being used with R. +

+

Header file Rconfig.h (included by R.h) is used to define platform-specific macros that are mainly for use in other header files. The macro WORDS_BIGENDIAN is defined on big-endian systems (e.g. most OSes on Sparc and PowerPC hardware) and not on little-endian systems (such as i686 and x86_64 on all OSes, and Linux on Alpha and Itanium). It can be useful when manipulating binary files. The macro SUPPORT_OPENMP is defined on suitable systems and can be used in conjunction with the SUPPORT_OPENMP_* macros in packages that want to make use of OpenMP.

+

Header file Rversion.h (not included by R.h) +defines a macro R_VERSION giving the version number encoded as an +integer, plus a macro R_Version to do the encoding. This can be +used to test if the version of R is late enough, or to include +back-compatibility features. For protection against very old versions +of R which did not have this macro, use a construction such as +

+
+
#if defined(R_VERSION) && R_VERSION >= R_Version(3, 1, 0)
+  ...
+#endif
+
+ +

More detailed information is available in the macros R_MAJOR, +R_MINOR, R_YEAR, R_MONTH and R_DAY: see the +header file Rversion.h for their format. Note that the minor +version includes the patchlevel (as in ‘2.2’). +

+

Packages which use alloca need to ensure it is defined: as it is +neither C99 nor POSIX there is no standard way to do so. As from R +3.2.2 one can use +

+
+
#include <Rconfig.h> // for HAVE_ALLOCA_H
+#ifdef __GNUC__
+// this covers gcc, clang, icc
+# undef alloca
+# define alloca(x) __builtin_alloca((x))
+#elif defined(HAVE_ALLOCA_H)
+// needed for native compilers on Solaris and AIX
+# include <alloca.h>
+#endif
+
+ +

(and this should be included before standard C headers such as +stdlib.h, since on some platforms these include malloc.h +which may have a conflicting definition), which suffices for known R +platforms. +

+
+ + + +

6.14 Inlining C functions

+ + +

The C99 keyword inline should be recognized by all compilers now +used to build R. Portable code which might be used with earlier +versions of R can be written using the macro R_INLINE (defined +in file Rconfig.h included by R.h), as for example from +package cluster +

+
+
#include <R.h>
+
+static R_INLINE int ind_2(int l, int j)
+{
+...
+}
+
+ +

Be aware that using inlining with functions in more than one compilation +unit is almost impossible to do portably, see +http://www.greenend.org.uk/rjk/2003/03/inline.html, so this usage +is for static functions as in the example. All the R +configure code has checked is that R_INLINE can be used in a +single C file with the compiler used to build R. We recommend that +packages making extensive use of inlining include their own configure +code. +

+
+ + + +

6.15 Controlling visibility

+ + +

Header R_ext/Visibility.h has some definitions for controlling the visibility of entry points. These are only effective when ‘HAVE_VISIBILITY_ATTRIBUTE’ is defined – this is checked when R is configured and recorded in header Rconfig.h (included by R_ext/Visibility.h). It is generally defined on modern Unix-alikes with a recent compiler, but not supported on OS X or Windows. Minimizing the visibility of symbols in a shared library will both speed up its loading (unlikely to be significant) and reduce the possibility of linking to other entry points of the same name.

+

C/C++ entry points prefixed by attribute_hidden will not be +visible in the shared object. There is no comparable mechanism for +FORTRAN entry points, but there is a more comprehensive scheme used by, +for example package stats. Most compilers which allow control of +visibility will allow control of visibility for all symbols via a flag, +and where known the flag is encapsulated in the macros +‘C_VISIBILITY’ and F77_VISIBILITY for C and FORTRAN +compilers. These are defined in etc/Makeconf and so available +for normal compilation of package code. For example, +src/Makevars could include +

+
+
PKG_CFLAGS=$(C_VISIBILITY)
+PKG_FFLAGS=$(F77_VISIBILITY)
+
+ +

This would end up with no visible entry points, which would be pointless. However, the effect of the flags can be overridden by using the attribute_visible prefix. A shared object which registers its entry points needs only to have one visible entry point, its initializer, so for example package stats has

+
+
void attribute_visible R_init_stats(DllInfo *dll)
+{
+    R_registerRoutines(dll, CEntries, CallEntries, FortEntries, NULL);
+    R_useDynamicSymbols(dll, FALSE);
+...
+}
+
+ +

The visibility mechanism is not available on Windows, but there is an equally effective way to control which entry points are visible, by supplying a definitions file pkgname/src/pkgname-win.def: only entry points listed in that file will be visible. Again using stats as an example, it has

+
+
LIBRARY stats.dll
+EXPORTS
+ R_init_stats
+
+ +
+ + + +

6.16 Using these functions in your own C code

+ +

It is possible to build Mathlib, the R set of mathematical +functions documented in Rmath.h, as a standalone library +libRmath under both Unix-alikes and Windows. (This includes the +functions documented in Numerical analysis subroutines as from +that header file.) +

+

The library is not built automatically when R is installed, but can +be built in the directory src/nmath/standalone in the R +sources: see the file README there. To use the code in your own +C program include +

+
+
#define MATHLIB_STANDALONE
+#include <Rmath.h>
+
+ +

and link against ‘-lRmath’ (and perhaps ‘-lm’). There is an +example file test.c. +

+

A little care is needed to use the random-number routines. You will +need to supply the uniform random number generator +

+
+
double unif_rand(void)
+
+ +

or use the one supplied (and with a dynamic library or DLL you will have to use the one supplied, which is the Marsaglia-multicarry with entry points

+
+
set_seed(unsigned int, unsigned int)
+
+ +

to set its seeds and +

+
+
get_seed(unsigned int *, unsigned int *)
+
+ +

to read the seeds). +
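A minimal standalone sketch, compiled with something like ‘cc test.c -lRmath -lm’; the seed values are arbitrary.

#define MATHLIB_STANDALONE
#include <Rmath.h>
#include <stdio.h>

int main(void)
{
    set_seed(123, 456);   /* seed the supplied Marsaglia-multicarry RNG */
    printf("qnorm(0.975) = %f\n", qnorm(0.975, 0.0, 1.0, 1, 0));
    printf("one N(0, 1) draw: %f\n", norm_rand());
    return 0;
}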

+
+ +
+


+
+ +

6.17 Organization of header files

+ +

The header files which R installs are in directory +R_INCLUDE_DIR (default R_HOME/include). This +currently includes +

+
+ + + + + + + + + + + + + + + + + + + + + + +
R.h                  includes many other files
S.h                  different version for code ported from S
Rinternals.h         definitions for using R's internal structures
Rdefines.h           macros for an S-like interface to the above (no longer maintained)
Rmath.h              standalone math library
Rversion.h           R version information
Rinterface.h         for add-on front-ends (Unix-alikes only)
Rembedded.h          for add-on front-ends
R_ext/Applic.h       optimization and integration
R_ext/BLAS.h         C definitions for BLAS routines
R_ext/Callbacks.h    C (and R function) top-level task handlers
R_ext/GetX11Image.h  X11Image interface used by package trkplot
R_ext/Lapack.h       C definitions for some LAPACK routines
R_ext/Linpack.h      C definitions for some LINPACK routines, not all of which are included in R
R_ext/Parse.h        a small part of R's parse interface: not part of the stable API
R_ext/RStartup.h     for add-on front-ends
R_ext/Rdynload.h     needed to register compiled code in packages
R_ext/R-ftp-http.h   interface to internal method of download.file
R_ext/Riconv.h       interface to iconv
R_ext/Visibility.h   definitions controlling visibility
R_ext/eventloop.h    for add-on front-ends and for packages that need to share in the R event loops (on all platforms)
+
+ +

The following headers are included by R.h: +

+
+ + + + + + + + + + + + + +
Rconfig.h           configuration info that is made available
R_ext/Arith.h       handling for NAs, NaNs, Inf/-Inf
R_ext/Boolean.h     TRUE/FALSE type
R_ext/Complex.h     C typedefs for R's complex
R_ext/Constants.h   constants
R_ext/Error.h       error handling
R_ext/Memory.h      memory allocation
R_ext/Print.h       Rprintf and variations
R_ext/RS.h          definitions common to R.h and S.h, including F77_CALL etc.
R_ext/Random.h      random number generation
R_ext/Utils.h       sorting and other utilities
R_ext/libextern.h   definitions for exports from R.dll on Windows
+
+ +

The graphics systems are exposed in headers +R_ext/GraphicsEngine.h, R_ext/GraphicsDevice.h (which it +includes) and R_ext/QuartzDevice.h. Facilities for defining +custom connection implementations are provided in +R_ext/Connections.h, but make sure you consult the file before +use. +

+

Let us re-iterate the advice to include system headers before the R +header files, especially Rinternals.h (included by +Rdefines.h) and Rmath.h, which redefine names which may be +used in system headers (fewer if ‘R_NO_REMAP’ is defined, or +‘R_NO_REMAP_RMATH’ for Rmath.h, as from R 3.1.0). +

+
+ + + +

7 Generic functions and methods

+ + + +

R programmers will often want to add methods for existing generic +functions, and may want to add new generic functions or make existing +functions generic. In this chapter we give guidelines for doing so, +with examples of the problems caused by not adhering to them. +

+

This chapter only covers the ‘informal’ class system copied from S3, and not the S4 (formal) methods of package methods.

+

First, a caveat: a function named gen.cl will +be invoked by the generic gen for class cl, so +do not name functions in this style unless they are intended to be +methods. +

+

The key function for methods is NextMethod, which dispatches the +next method. It is quite typical for a method function to make a few +changes to its arguments, dispatch to the next method, receive the +results and modify them a little. An example is +

+
+
t.data.frame <- function(x)
+{
+    x <- as.matrix(x)
+    NextMethod("t")
+}
+
+ +

Note that the example above works because there is a next method, +the default method, not that a new method is selected when the class is +changed. +

+

Any method a programmer writes may be invoked from another method +by NextMethod, with the arguments appropriate to the +previous method. Further, the programmer cannot predict which method +NextMethod will pick (it might be one not yet dreamt of), and the +end user calling the generic needs to be able to pass arguments to the +next method. For this to work +

+
A method must have all the arguments of the generic, including ... if
the generic does.

It is a grave misunderstanding to think that a method needs only to
accept the arguments it needs.  The original S version of predict.lm
did not have a ... argument, although predict did.  It soon became
clear that predict.glm needed an argument dispersion to handle
over-dispersion.  As predict.lm had neither a dispersion nor a ...
argument, NextMethod could no longer be used.  (The legacy, two direct
calls to predict.lm, lives on in predict.glm in R, which is based on
the workaround for S3 written by Venables & Ripley.)

+

Further, the user is entitled to use positional matching when calling +the generic, and the arguments to a method called by UseMethod +are those of the call to the generic. Thus +

+
+

A method must have arguments in exactly the same order as the +generic. +

+ +

To see the scale of this problem, consider the generic function +scale, defined as +

+
+
scale <- function (x, center = TRUE, scale = TRUE)
+    UseMethod("scale")
+
+ +

Suppose an unthinking package writer created methods such as +

+
+
scale.foo <- function(x, scale = FALSE, ...) { }
+
+ +

Then for x of class "foo" the calls +

+
+
scale(x, , TRUE)
+scale(x, scale = TRUE)
+
would most likely do different things, to the justifiable consternation
of the end user.

+

To add a further twist, which default is used when a user calls +scale(x) in our example? What if +

+
+
scale.bar <- function(x, center, scale = TRUE) NextMethod("scale")
+
+ +

and x has class c("bar", "foo")? It is the default +specified in the method that is used, but the default +specified in the generic may be the one the user sees. +This leads to the recommendation: +

+
+

If the generic specifies defaults, all methods should use the same defaults. +

+ +

An easy way to follow these recommendations is to always keep generics +simple, e.g. +

+
+
scale <- function(x, ...) UseMethod("scale")
+
+ +

Only add parameters and defaults to the generic if they make sense in +all possible methods implementing it. +

+ + + + +
+ + + +

7.1 Adding new generics

When creating a new generic function, bear in mind that its argument
list will be the maximal set of arguments for methods, including those
written elsewhere years later.  So choosing a good set of arguments may
well be an important design issue, and there need to be good arguments
not to include a ... argument.

If a ... argument is supplied, some thought should be given to its
position in the argument sequence.  Arguments which follow ... must be
named in calls to the function, and they must be named in full (partial
matching is suppressed after ...).  Formal arguments before ... can be
partially matched, and so may ‘swallow’ actual arguments intended for
....  Although it is commonplace to make the ... argument the last one,
that is not always the right choice.

+

Sometimes package writers want to make generic a function in the base +package, and request a change in R. This may be justifiable, but +making a function generic with the old definition as the default method +does have a small performance cost. It is never necessary, as a package +can take over a function in the base package and make it generic by +something like +

+
+
foo <- function(object, ...) UseMethod("foo")
+foo.default <- function(object, ...) base::foo(object)
+
+ +

Earlier versions of this manual suggested assigning foo.default <- +base::foo. This is not a good idea, as it captures the base +function at the time of installation and it might be changed as R is +patched or updated. +

+

The same idea can be applied for functions in other packages with namespaces. +

+
+ + + +

8 Linking GUIs and other front-ends to R

+ +

There are a number of ways to build front-ends to R: we take this to +mean a GUI or other application that has the ability to submit commands +to R and perhaps to receive results back (not necessarily in a text +format). There are other routes besides those described here, for +example the package Rserve (from CRAN, see also +https://www.rforge.net/Rserve/) and connections to Java in +‘JRI’ (part of the rJava package on CRAN) and +the Omegahat/Bioconductor package ‘SJava’. +

+

Note that the APIs described in this chapter are only intended to be +used in an alternative front-end: they are not part of the API made +available for R packages and can be dangerous to use in a +conventional package (although packages may contain alternative +front-ends). Conversely some of the functions from the API (such as +R_alloc) should not be used in front-ends. +

+ + + + + +
+ + + +

8.1 Embedding R under Unix-alikes

R can be built as a shared library [114] if configured with
--enable-R-shlib.  This shared library can be used to run R from
alternative front-end programs.  We will assume this has been done for
the rest of this section.  Also, it can be built as a static library if
configured with --enable-R-static-lib, and that can be used in a very
similar way (at least on Linux: on other platforms one needs to ensure
that all the symbols exported by libR.a are linked into the front-end).

+

The command-line R front-end, R_HOME/bin/exec/R, is one +such example, and the former GNOME (see package gnomeGUI +on CRAN’s ‘Archive’ area) and OS X consoles are others. +The source for R_HOME/bin/exec/R is in file +src/main/Rmain.c and is very simple +

+
+
int Rf_initialize_R(int ac, char **av); /* in ../unix/system.c */
+void Rf_mainloop();                     /* in main.c */
+
+extern int R_running_as_main_program;   /* in ../unix/system.c */
+
+int main(int ac, char **av)
+{
+    R_running_as_main_program = 1;
+    Rf_initialize_R(ac, av);
+    Rf_mainloop(); /* does not return */
+    return 0;
+}
+
+ +

indeed, misleadingly simple. Remember that +R_HOME/bin/exec/R is run from a shell script +R_HOME/bin/R which sets up the environment for the +executable, and this is used for +

+
    +
  • Setting R_HOME and checking it is valid, as well as the path +R_SHARE_DIR and R_DOC_DIR to the installed share and +doc directory trees. Also setting R_ARCH if needed. + +
  • Setting LD_LIBRARY_PATH to include the directories used in linking +R. This is recorded as the default setting of +R_LD_LIBRARY_PATH in the shell script +R_HOME/etcR_ARCH/ldpaths. + +
  • Processing some of the arguments, for example to run R under a +debugger and to launch alternative front-ends to provide GUIs. +
+ +

The first two of these can be achieved for your front-end by running it +via R CMD. So, for example +

+
+
R CMD /usr/local/lib/R/bin/exec/R
+R CMD exec/R
+
+ +

will both work in a standard R installation. (R CMD looks +first for executables in R_HOME/bin. These command-lines +need modification if a sub-architecture is in use.) If you do not want +to run your front-end in this way, you need to ensure that R_HOME +is set and LD_LIBRARY_PATH is suitable. (The latter might well +be, but modern Unix/Linux systems do not normally include +/usr/local/lib (/usr/local/lib64 on some architectures), +and R does look there for system components.) +

The other senses in which this example is too simple are that all the
internal defaults are used and that control is handed over to the R
main loop.  There are a number of small examples [115] in the
tests/Embedding directory.  These make use of Rf_initEmbeddedR in
src/main/Rembedded.c, and essentially use

#include <Rembedded.h>

int main(int ac, char **av)
{
    /* do some setup */
    Rf_initEmbeddedR(ac, av);
    /* do some more setup */

    /* Submit some code to R, which is done interactively via

           run_Rmainloop();

       A possible substitute for a pseudo-console is

           R_ReplDLLinit();
           while(R_ReplDLLdo1() > 0) {
               ... add user actions here if desired ...
           }
     */
    Rf_endEmbeddedR(0);
    /* final tidying up after R is shut down */
    return 0;
}
+
+ +

If you do not want to pass R arguments, you can fake an argv +array, for example by +

+
+
    char *argv[]= {"REmbeddedPostgres", "--silent"};
+    Rf_initEmbeddedR(sizeof(argv)/sizeof(argv[0]), argv);
+
+ +

However, to make a GUI we usually do want to run run_Rmainloop +after setting up various parts of R to talk to our GUI, and arranging +for our GUI callbacks to be called during the R mainloop. +

+

One issue to watch is that on some platforms Rf_initEmbeddedR and +Rf_endEmbeddedR change the settings of the FPU (e.g. to allow +errors to be trapped and to make use of extended precision registers). +

+

The standard code sets up a session temporary directory in the usual +way, unless R_TempDir is set to a non-NULL value before +Rf_initEmbeddedR is called. In that case the value is assumed to +contain an existing writable directory (no check is done), and it is not +cleaned up when R is shut down. +

+

Rf_initEmbeddedR sets R to be in interactive mode: you can set +R_Interactive (defined in Rinterface.h) subsequently to +change this. +

+
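Putting those two points together, a front-end might look something like
the following minimal sketch (not from the manual: the program name and
directory are made up, and R_TempDir is assumed to be the declaration
exported by Rembedded.h):

/* Sketch: supply a pre-existing temporary directory before initialisation
   and switch off interactive mode afterwards. */
#include <Rembedded.h>
#include <Rinterface.h>
#include <R_ext/Boolean.h>

int main(int argc, char **argv)
{
    char *r_argv[] = { "REmbeddedApp", "--silent", "--no-save" };

    R_TempDir = "/var/tmp/my-app-R";   /* must already exist and be writable */
    Rf_initEmbeddedR(sizeof(r_argv) / sizeof(r_argv[0]), r_argv);
    R_Interactive = FALSE;             /* run non-interactively */

    /* ... evaluate R code here ... */

    Rf_endEmbeddedR(0);
    return 0;
}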

Note that R expects to be run with the locale category +‘LC_NUMERIC’ set to its default value of C, and so should +not be embedded into an application which changes that. +

+

It is the user’s responsibility to attempt to initialize only once. To +protect the R interpreter, Rf_initialize_R will exit the +process if re-initialization is attempted. +

+ + + + + + + + +
+ + + +

8.1.1 Compiling against the R library

+ +

Suitable flags to compile and link against the R (shared or static) +library can be found by +

+
+
R CMD config --cppflags
+R CMD config --ldflags
+
+ +

(These apply only to an uninstalled copy or a standard install.) +

+

If R is installed, pkg-config is available and neither +sub-architectures nor an OS X framework have been used, alternatives for +a shared R library are +

+
+
pkg-config --cflags libR
+pkg-config --libs libR
+
+ +

and for a static R library +

+
+
pkg-config --cflags libR
+pkg-config --libs --static libR
+
(This may work for an installed OS X framework if pkg-config is taught
where to look for libR.pc: it is installed inside the framework.)

+

However, a more comprehensive way is to set up a Makefile to +compile the front-end. Suppose file myfe.c is to be compiled to +myfe. A suitable Makefile might be +

+
+
include ${R_HOME}/etc${R_ARCH}/Makeconf
+all: myfe
+
+## The following is not needed, but avoids PIC flags.
+myfe.o: myfe.c
+        $(CC) $(ALL_CPPFLAGS) $(CFLAGS) -c myfe.c -o $@
+
+## replace $(LIBR) $(LIBS) by $(STATIC_LIBR) if R was built with a static libR
+myfe: myfe.o
+        $(MAIN_LINK) -o $@ myfe.o $(LIBR) $(LIBS)
+
+ +

invoked as +

+
+
R CMD make
+R CMD myfe
+
+ +

Additional flags which $(MAIN_LINK) includes are, amongst others, +those to select OpenMP and --export-dynamic for the GNU linker +on some platforms. In principle $(LIBS) is not needed +when using a shared R library as libR is linked against +those libraries, but some platforms need the executable also linked +against them. +

+
+ + + +

8.1.2 Setting R callbacks

+ +

For Unix-alikes there is a public header file Rinterface.h that +makes it possible to change the standard callbacks used by R in a +documented way. This defines pointers (if R_INTERFACE_PTRS is +defined) +

+
+
extern void (*ptr_R_Suicide)(const char *);
+extern void (*ptr_R_ShowMessage)(const char *);
+extern int  (*ptr_R_ReadConsole)(const char *, unsigned char *, int, int);
+extern void (*ptr_R_WriteConsole)(const char *, int);
+extern void (*ptr_R_WriteConsoleEx)(const char *, int, int);
+extern void (*ptr_R_ResetConsole)();
+extern void (*ptr_R_FlushConsole)();
+extern void (*ptr_R_ClearerrConsole)();
+extern void (*ptr_R_Busy)(int);
+extern void (*ptr_R_CleanUp)(SA_TYPE, int, int);
+extern int  (*ptr_R_ShowFiles)(int, const char **, const char **,
+                               const char *, Rboolean, const char *);
+extern int  (*ptr_R_ChooseFile)(int, char *, int);
+extern int  (*ptr_R_EditFile)(const char *);
+extern void (*ptr_R_loadhistory)(SEXP, SEXP, SEXP, SEXP);
+extern void (*ptr_R_savehistory)(SEXP, SEXP, SEXP, SEXP);
+extern void (*ptr_R_addhistory)(SEXP, SEXP, SEXP, SEXP);
+// added in R 3.0.0
+extern int  (*ptr_R_EditFiles)(int, const char **, const char **, const char *);
+extern SEXP (*ptr_do_selectlist)(SEXP, SEXP, SEXP, SEXP);
+extern SEXP (*ptr_do_dataentry)(SEXP, SEXP, SEXP, SEXP);
+extern SEXP (*ptr_do_dataviewer)(SEXP, SEXP, SEXP, SEXP);
+extern void (*ptr_R_ProcessEvents)();
+
+ +

which allow standard R callbacks to be redirected to your GUI. What +these do is generally documented in the file src/unix/system.txt. +

+
+
Function: void R_ShowMessage (char *message)
+

This should display the message, which may have multiple lines: it +should be brought to the user’s attention immediately. +

+ +
+
Function: void R_Busy (int which)
+

This function invokes actions (such as change of cursor) when R +embarks on an extended computation (which=1) and when such +a state terminates (which=0). +

+ +
+
Function: int R_ReadConsole (const char *prompt, unsigned char *buf, int buflen, int hist)
+
Function: void R_WriteConsole (const char *buf, int buflen)
+
Function: void R_WriteConsoleEx (const char *buf, int buflen, int otype)
+
Function: void R_ResetConsole ()
+
Function: void R_FlushConsole ()
+
Function: void R_ClearErrConsole ()
+
+

These functions interact with a console. +

R_ReadConsole prints the given prompt at the console and then does a
fgets(3)-like operation, transferring up to buflen characters into the
buffer buf.  The last two bytes should be set to ‘"\n\0"’ to preserve
sanity.  If hist is non-zero, then the line should be added to any
command history which is being maintained.  The return value is 0 if no
input is available and >0 otherwise.

R_WriteConsoleEx writes the given buffer to the console; otype
specifies the output type (regular output or warning/error).  A call to
R_WriteConsole(buf, buflen) is equivalent to
R_WriteConsoleEx(buf, buflen, 0).  To ensure backward compatibility of
the callbacks, ptr_R_WriteConsoleEx is used only if ptr_R_WriteConsole
is set to NULL.  To ensure that stdout() and stderr() connections point
to the console, set the corresponding files to NULL via

+
      R_Outputfile = NULL;
+      R_Consolefile = NULL;
+
+ +

R_ResetConsole is called when the system is reset after an error. +R_FlushConsole is called to flush any pending output to the +system console. R_ClearerrConsole clears any errors associated +with reading from the console. +

+ +
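As an illustration (a minimal sketch, not from the manual; the handler
names and the program name are made up), a console-style front-end could
redirect input and output along these lines:

/* Sketch: route R console input and output through our own handlers.
   R_INTERFACE_PTRS must be defined before including Rinterface.h. */
#include <stdio.h>

#define R_INTERFACE_PTRS
#include <Rinterface.h>
#include <Rembedded.h>

static void my_write_ex(const char *buf, int buflen, int otype)
{
    /* otype != 0 indicates warning/error output */
    fwrite(buf, 1, buflen, otype ? stderr : stdout);
}

static int my_read(const char *prompt, unsigned char *buf, int buflen, int hist)
{
    fputs(prompt, stdout);
    fflush(stdout);
    return fgets((char *) buf, buflen, stdin) ? 1 : 0;
}

int main(int argc, char **argv)
{
    char *r_argv[] = { "myfe", "--silent" };
    Rf_initEmbeddedR(sizeof(r_argv) / sizeof(r_argv[0]), r_argv);

    R_Outputfile = NULL;            /* so stdout()/stderr() use the console */
    R_Consolefile = NULL;
    ptr_R_WriteConsole = NULL;      /* ensure ptr_R_WriteConsoleEx is used */
    ptr_R_WriteConsoleEx = my_write_ex;
    ptr_R_ReadConsole = my_read;

    R_ReplDLLinit();
    while (R_ReplDLLdo1() > 0)
        ;                           /* handle other application events here */

    Rf_endEmbeddedR(0);
    return 0;
}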
+
Function: int R_ShowFiles (int nfile, const char **file, const char **headers, const char *wtitle, Rboolean del, const char *pager)
+
+

This function is used to display the contents of files. +

+ +
+
Function: int R_ChooseFile (int new, char *buf, int len)
+
+

Choose a file and return its name in buf of length len. +Return value is 0 for success, > 0 otherwise. +

+ +
+
Function: int R_EditFile (const char *buf)
+

Send a file to an editor window. +

+ +
+
Function: int R_EditFiles (int nfile, const char **file, const char **title, const char *editor)
+

Send nfile files to an editor, with titles possibly to be used for +the editor window(s). +

+ +
+
Function: SEXP R_loadhistory (SEXP, SEXP, SEXP, SEXP);
+
Function: SEXP R_savehistory (SEXP, SEXP, SEXP, SEXP);
+
Function: SEXP R_addhistory (SEXP, SEXP, SEXP, SEXP);
+
+

.Internal functions for loadhistory, savehistory +and timestamp. +

+

If the console has no history mechanism these can be as +simple as +

+
+
SEXP R_loadhistory (SEXP call, SEXP op, SEXP args, SEXP env)
+{
+    errorcall(call, "loadhistory is not implemented");
+    return R_NilValue;
+}
+SEXP R_savehistory (SEXP call, SEXP op , SEXP args, SEXP env)
+{
+    errorcall(call, "savehistory is not implemented");
+    return R_NilValue;
+}
+SEXP R_addhistory (SEXP call, SEXP op , SEXP args, SEXP env)
+{
+    return R_NilValue;
+}
+
+ +

The R_addhistory function should return silently if no history +mechanism is present, as a user may be calling timestamp purely +to write the time stamp to the console. +

+ +
+
Function: void R_Suicide (const char *message)
+

This should abort R as rapidly as possible, displaying the message. +A possible implementation is +

+
void R_Suicide (const char *message)
{
    char  pp[1024];
    snprintf(pp, 1024, "Fatal error: %s\n", message);
    R_ShowMessage(pp);
    R_CleanUp(SA_SUICIDE, 2, 0);
}
+
+
+ +
+
Function: void R_CleanUp (SA_TYPE saveact, int status, int RunLast)
+
+

This function invokes any actions which occur at system termination. +It needs to be quite complex: +

+
#include <Rinterface.h>
#include <Rembedded.h>    /* for Rf_KillAllDevices */

void R_CleanUp (SA_TYPE saveact, int status, int RunLast)
{
    if(saveact == SA_DEFAULT) saveact = SaveAction;
    if(saveact == SA_SAVEASK) {
       /* ask what to do and set saveact */
    }
    switch (saveact) {
    case SA_SAVE:
        if(RunLast) R_dot_Last();
        if(R_DirtyImage) R_SaveGlobalEnv();
        /* save the console history in R_HistoryFile */
        break;
    case SA_NOSAVE:
        if(RunLast) R_dot_Last();
        break;
    case SA_SUICIDE:
    default:
        break;
    }

    R_RunExitFinalizers();
    /* clean up after the editor e.g. CleanEd() */

    R_CleanTempDir();

    /* close all the graphics devices */
    if(saveact != SA_SUICIDE) Rf_KillAllDevices();
    fpu_setup(FALSE);

    exit(status);
}
+
+
+ +

These callbacks should never be changed in a running R session (and +hence cannot be called from an extension package). +

+
+
Function: SEXP R_dataentry (SEXP, SEXP, SEXP, SEXP);
+
Function: SEXP R_dataviewer (SEXP, SEXP, SEXP, SEXP);
+
Function: SEXP R_selectlist (SEXP, SEXP, SEXP, SEXP);
+
+

.External functions for dataentry (and edit on +matrices and data frames), View and select.list. These +can be changed if they are not currently in use. +

+ + +
+ + + +

8.1.3 Registering symbols

+ +

An application embedding R needs a different way of registering +symbols because it is not a dynamic library loaded by R as would be +the case with a package. Therefore R reserves a special +DllInfo entry for the embedding application such that it can +register symbols to be used with .C, .Call etc. This +entry can be obtained by calling getEmbeddingDllInfo, so a +typical use is +

+
+
DllInfo *info = R_getEmbeddingDllInfo();
+R_registerRoutines(info, cMethods, callMethods, NULL, NULL);
+
+ +

The native routines defined by cMethods and callMethods +should be present in the embedding application. See Registering native routines for details on registering symbols in general. +

+ +
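For instance (a minimal sketch, not from the manual; the routine
embedded_mean and its registration table are invented for illustration),
the embedding application could expose one of its own C functions so
that R code can call .Call("embedded_mean", x):

/* Sketch: register a native routine from the embedding application so
   that R code can reach it via .Call("embedded_mean", x). */
#include <Rinternals.h>
#include <R_ext/Rdynload.h>

/* x is assumed to be a double vector */
static SEXP embedded_mean(SEXP x)
{
    double s = 0;
    R_xlen_t n = XLENGTH(x);
    for (R_xlen_t i = 0; i < n; i++) s += REAL(x)[i];
    return ScalarReal(n > 0 ? s / n : R_NaN);
}

static const R_CallMethodDef callMethods[] = {
    {"embedded_mean", (DL_FUNC) &embedded_mean, 1},
    {NULL, NULL, 0}
};

void register_embedded_routines(void)
{
    DllInfo *info = R_getEmbeddingDllInfo();
    R_registerRoutines(info, NULL, callMethods, NULL, NULL);
}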
+ + + +

8.1.4 Meshing event loops

+ +

One of the most difficult issues in interfacing R to a front-end is +the handling of event loops, at least if a single thread is used. R +uses events and timers for +

+
    +
  • Running X11 windows such as the graphics device and data editor, and +interacting with them (e.g., using locator()). + +
  • Supporting Tcl/Tk events for the tcltk package (for at least the +X11 version of Tk). + +
  • Preparing input. + +
  • Timing operations, for example for profiling R code and +Sys.sleep(). + +
  • Interrupts, where permitted. +
+ +

Specifically, the Unix-alike command-line version of R runs separate +event loops for +

+
    +
  • Preparing input at the console command-line, in file +src/unix/sys-unix.c. + +
  • Waiting for a response from a socket in the internal functions +underlying FTP and HTTP transfers in download.file() and for +direct socket access, in files +src/modules/internet/nanoftp.c, +src/modules/internet/nanohttp.c and +src/modules/internet/Rsock.c + +
  • Mouse and window events when displaying the X11-based dataentry window, +in file src/modules/X11/dataentry.c. This is regarded as +modal, and no other events are serviced whilst it is active. +
+ +

There is a protocol for adding event handlers to the first two types of +event loops, using types and functions declared in the header +R_ext/eventloop.h and described in comments in file +src/unix/sys-std.c. It is possible to add (or remove) an input +handler for events on a particular file descriptor, or to set a polling +interval (via R_wait_usec) and a function to be called +periodically via R_PolledEvents: the polling mechanism is used by +the tcltk package. +

+
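For example (a minimal sketch, not from the manual; the handler name is
made up), a front-end that only needs a periodic tick while R waits for
console input could install a polled handler:

/* Sketch: ask R to call our polling function roughly every 100 ms while
   it is waiting for console input. */
#include <R_ext/eventloop.h>

static void my_poll(void)
{
    /* pump the GUI's pending events here */
}

void install_polling(void)
{
    R_wait_usec = 100000;       /* polling interval in microseconds */
    R_PolledEvents = my_poll;
}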

It is not intended that these facilities are used by packages, but if +they are needed exceptionally, the package should ensure that it cleans +up and removes its handlers when its namespace is unloaded. +

+

An alternative front-end needs both to make provision for other R +events whilst waiting for input, and to ensure that it is not frozen out +during events of the second type. This is not handled very well in the +existing examples. The GNOME front-end ran a private handler for polled +events by setting +

+
+
extern int (*R_timeout_handler)();
+extern long R_timeout_val;
+
+      if (R_timeout_handler && R_timeout_val)
+          gtk_timeout_add(R_timeout_val, R_timeout_handler, NULL);
+      gtk_main ();
+
+ +

whilst it is waiting for console input. This obviously handles events +for Gtk windows (such as the graphics device in the gtkDevice +package), but not X11 events (such as the X11() device) or for +other event handlers that might have been registered with R. It does +not attempt to keep itself alive whilst R is waiting on sockets. The +ability to add a polled handler as R_timeout_handler is used by +the tcltk package. +

+ +
+ + + +

8.1.5 Threading issues

+ +

Embedded R is designed to be run in the main thread, and all the +testing is done in that context. There is a potential issue with the +stack-checking mechanism where threads are involved. This uses two +variables declared in Rinterface.h (if CSTACK_DEFNS is +defined) as +

+
+
extern uintptr_t R_CStackLimit; /* C stack limit */
+extern uintptr_t R_CStackStart; /* Initial stack address */
+
+ +

Note that uintptr_t is a C99 type for which a substitute is +defined in R, so your code needs to define HAVE_UINTPTR_T +appropriately. +

These will be set [116] when Rf_initialize_R is called, to values
appropriate to the main thread.  Stack-checking can be disabled by
setting R_CStackLimit = (uintptr_t)-1 immediately after
Rf_initialize_R is called, but it is better, if possible, to set
appropriate values.  (What these are and how to determine them are
OS-specific, and the stack size limit may differ for secondary threads.
If you have a choice of stack size, at least 10Mb is recommended.)

+
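A minimal sketch of doing so (not from the manual; the program name is
made up, and stdint.h is assumed to provide uintptr_t so that
HAVE_UINTPTR_T can be defined):

/* Sketch: disable R's C stack-checking immediately after initialising
   embedded R, for use when the stack bounds cannot be determined. */
#include <stdint.h>

#define HAVE_UINTPTR_T          /* stdint.h provides uintptr_t */
#define CSTACK_DEFNS
#include <Rinterface.h>
#include <Rembedded.h>

void start_R(void)
{
    char *r_argv[] = { "myfe", "--silent", "--no-save" };

    Rf_initEmbeddedR(sizeof(r_argv) / sizeof(r_argv[0]), r_argv);
    R_CStackLimit = (uintptr_t) -1;   /* no stack-checking */
}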

You may also want to consider how signals are handled: R sets signal +handlers for several signals, including SIGINT, SIGSEGV, +SIGPIPE, SIGUSR1 and SIGUSR2, but these can all be +suppressed by setting the variable R_SignalHandlers (declared in +Rinterface.h) to 0. +

Note that these variables must not be changed by an R package: a
package should not be calling R internals which make use of the
stack-checking mechanism on a secondary thread.

+
+ + + +

8.2 Embedding R under Windows

+ +

All Windows interfaces to R call entry points in the DLL +R.dll, directly or indirectly. Simpler applications may find it +easier to use the indirect route via (D)COM. +

+ + + + + + +
+ + + +

8.2.1 Using (D)COM

+ +

(D)COM is a standard Windows mechanism used for communication +between Windows applications. One application (here R) is run as COM +server which offers services to clients, here the front-end calling +application. The services are described in a ‘Type Library’ and are +(more or less) language-independent, so the calling application can be +written in C or C++ or Visual Basic or Perl or Python and so on. +The ‘D’ in (D)COM refers to ‘distributed’, as the client and server can +be running on different machines. +

+

The basic R distribution is not a (D)COM server, but two addons are +currently available that interface directly with R and provide a +(D)COM server: +

+
+ + + +

8.2.2 Calling R.dll directly

+ +

The R DLL is mainly written in C and has _cdecl entry +points. Calling it directly will be tricky except from C code (or C++ +with a little care). +

+

There is a version of the Unix-alike interface calling +

+
+
int Rf_initEmbeddedR(int ac, char **av);
+void Rf_endEmbeddedR(int fatal);
+
+ +

which is an entry point in R.dll. Examples of its use (and a +suitable Makefile.win) can be found in the tests/Embedding +directory of the sources. You may need to ensure that +R_HOME/bin is in your PATH so the R DLLs are found. +

+

Examples of calling R.dll directly are provided in the directory +src/gnuwin32/front-ends, including a simple command-line +front end rtest.c whose code is +

+
+
#define Win32
+#include <windows.h>
+#include <stdio.h>
+#include <Rversion.h>
+#define LibExtern __declspec(dllimport) extern
+#include <Rembedded.h>
+#include <R_ext/RStartup.h>
+/* for askok and askyesnocancel */
+#include <graphapp.h>
+
+/* for signal-handling code */
+#include <psignal.h>
+
+/* simple input, simple output */
+
+/* This version blocks all events: a real one needs to call ProcessEvents
+   frequently. See rterm.c and ../system.c for one approach using
+   a separate thread for input.
+*/
+int myReadConsole(const char *prompt, char *buf, int len, int addtohistory)
+{
+    fputs(prompt, stdout);
+    fflush(stdout);
+    if(fgets(buf, len, stdin)) return 1; else return 0;
+}
+
+void myWriteConsole(const char *buf, int len)
+{
+    printf("%s", buf);
+}
+
+void myCallBack(void)
+{
+    /* called during i/o, eval, graphics in ProcessEvents */
+}
+
+void myBusy(int which)
+{
+    /* set a busy cursor ... if which = 1, unset if which = 0 */
+}
+
+static void my_onintr(int sig) { UserBreak = 1; }
+
+int main (int argc, char **argv)
+{
+    structRstart rp;
+    Rstart Rp = &rp;
+    char Rversion[25], *RHome;
+
+    sprintf(Rversion, "%s.%s", R_MAJOR, R_MINOR);
+    if(strcmp(getDLLVersion(), Rversion) != 0) {
+        fprintf(stderr, "Error: R.DLL version does not match\n");
+        exit(1);
+    }
+
+    R_setStartTime();
+    R_DefParams(Rp);
+    if((RHome = get_R_HOME()) == NULL) {
+        fprintf(stderr, "R_HOME must be set in the environment or Registry\n");
+        exit(1);
+    }
+    Rp->rhome = RHome;
+    Rp->home = getRUser();
+    Rp->CharacterMode = LinkDLL;
+    Rp->ReadConsole = myReadConsole;
+    Rp->WriteConsole = myWriteConsole;
+    Rp->CallBack = myCallBack;
+    Rp->ShowMessage = askok;
+    Rp->YesNoCancel = askyesnocancel;
+    Rp->Busy = myBusy;
+
+    Rp->R_Quiet = TRUE;        /* Default is FALSE */
+    Rp->R_Interactive = FALSE; /* Default is TRUE */
+    Rp->RestoreAction = SA_RESTORE;
+    Rp->SaveAction = SA_NOSAVE;
+    R_SetParams(Rp);
+    R_set_command_line_arguments(argc, argv);
+
+    FlushConsoleInputBuffer(GetStdHandle(STD_INPUT_HANDLE));
+
+    signal(SIGBREAK, my_onintr);
+    GA_initapp(0, 0);
+    readconsolecfg();
+    setup_Rmainloop();
+#ifdef SIMPLE_CASE
+    run_Rmainloop();
+#else
+    R_ReplDLLinit();
+    while(R_ReplDLLdo1() > 0) {
+/* add user actions here if desired */
+    }
+/* only get here on EOF (not q()) */
+#endif
+    Rf_endEmbeddedR(0);
+    return 0;
+}
+
+ +

The ideas are +

+
    +
  • Check that the front-end and the linked R.dll match – other +front-ends may allow a looser match. + +
  • Find and set the R home directory and the user’s home directory. The +former may be available from the Windows Registry: it will be in +HKEY_LOCAL_MACHINE\Software\R-core\R\InstallPath from an +administrative install and +HKEY_CURRENT_USER\Software\R-core\R\InstallPath otherwise, if +selected during installation (as it is by default). + +
  • Define startup conditions and callbacks via the Rstart structure. +R_DefParams sets the defaults, and R_SetParams sets +updated values. + +
  • Record the command-line arguments used by +R_set_command_line_arguments for use by the R function +commandArgs(). + +
  • Set up the signal handler and the basic user interface. + +
  • Run the main R loop, possibly with our actions intermeshed. + +
  • Arrange to clean up. +
+ +

An underlying theme is the need to keep the GUI ‘alive’, and this has +not been done in this example. The R callback R_ProcessEvents +needs to be called frequently to ensure that Windows events in R +windows are handled expeditiously. Conversely, R needs to allow the +GUI code (which is running in the same process) to update itself as +needed – two ways are provided to allow this: +

+
    +
  • R_ProcessEvents calls the callback registered by +Rp->callback. A version of this is used to run package Tcl/Tk +for tcltk under Windows, for the code is + +
    +
    void R_ProcessEvents(void)
    +{
    +    while (peekevent()) doevent(); /* Windows events for GraphApp */
    +    if (UserBreak) { UserBreak = FALSE; onintr(); }
    +    R_CallBackHook();
    +    if(R_tcldo) R_tcldo();
    +}
    +
    + +
  • The mainloop can be split up to allow the calling application to take +some action after each line of input has been dealt with: see the +alternative code below #ifdef SIMPLE_CASE. +
+ +

It may be that no R GraphApp windows need to be considered, although +these include pagers, the windows() graphics device, the R +data and script editors and various popups such as choose.file() +and select.list(). It would be possible to replace all of these, +but it seems easier to allow GraphApp to handle most of them. +

It is possible to run R in a GUI in a single thread (as RGui.exe shows)
but it will normally be easier [117] to use multiple threads.

+

Note that R’s own front ends use a stack size of 10Mb, whereas MinGW +executables default to 2Mb, and Visual C++ ones to 1Mb. The latter +stack sizes are too small for a number of R applications, so +general-purpose front-ends should use a larger stack size. +

+ +
+ + + +

8.2.3 Finding R_HOME

Both applications which embed R and those which use a system call to
invoke R (as Rscript.exe, Rterm.exe or R.exe) need to be able to find
the R bin directory.  The simplest way to do so is to ask the user to
set an environment variable R_HOME and use that, but naive users may be
flummoxed as to how to do so or what value to use.

+

The R for Windows installers have for a long time allowed the value +of R_HOME to be recorded in the Windows Registry: this is +optional but selected by default. Where it is recorded has +changed over the years to allow for multiple versions of R to be +installed at once, and to allow 32- and 64-bit versions of R to be +installed on the same machine. +

+

The basic Registry location is Software\R-core\R. For an +administrative install this is under HKEY_LOCAL_MACHINE and on a +64-bit OS HKEY_LOCAL_MACHINE\Software\R-core\R is by default +redirected for a 32-bit application, so a 32-bit application will see +the information for the last 32-bit install, and a 64-bit application +that for the last 64-bit install. For a personal install, the +information is under HKEY_CURRENT_USER\Software\R-core\R which is +seen by both 32-bit and 64-bit applications and so records the last +install of either architecture. To circumvent this, there are locations +Software\R-core\R32 and Software\R-core\R64 which always +refer to one architecture. +

+

When R is installed and recording is not disabled then two string +values are written at that location for keys InstallPath and +Current Version, and these keys are removed when R is +uninstalled. To allow information about other installed versions to be +retained, there is also a key named something like 3.0.0 or +3.0.0 patched or 3.1.0 Pre-release with a value for +InstallPath. +

+

So a comprehensive algorithm to search for R_HOME is something +like +

+
    +
  • Decide which of personal or administrative installs should have +precedence. There are arguments both ways: we find that with roaming +profiles that HKEY_CURRENT_USER\Software often gets reverted to +an earlier version. Do the following for one or both of +HKEY_CURRENT_USER and HKEY_LOCAL_MACHINE. + +
  • If the desired architecture is known, look in Software\R-core\R32 +or Software\R-core\R64, and if that does not exist or the +architecture is immaterial, in Software\R-core\R. + +
  • If key InstallPath exists then this is R_HOME (recorded +using backslashes). If it does not, look for version-specific keys like +2.11.0 alpha, pick the latest (which is of itself a complicated +algorithm as 2.11.0 patched > 2.11.0 > 2.11.0 alpha > 2.8.1) and +use its value for InstallPath. +
+ +

Prior to R 2.12.0 R.dll and the various front-end executables +were in R_HOME\bin, but they are now in R_HOME\bin\i386 or +R_HOME\bin\x64. So you may need to arrange to look first in the +architecture-specific subdirectory and then in R_HOME\bin. +

+
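As an illustration of the first two steps of the search algorithm above
(a minimal sketch, not from the manual: error handling and the fallback
to version-specific keys are omitted, and personal installs are
arbitrarily given precedence here), a front-end could query the Registry
like this:

/* Sketch: look up InstallPath under Software\R-core\R, trying
   HKEY_CURRENT_USER before HKEY_LOCAL_MACHINE.
   Returns 1 and fills rhome on success, 0 otherwise. */
#include <windows.h>

static int find_R_HOME(char *rhome, DWORD size)
{
    const HKEY roots[] = { HKEY_CURRENT_USER, HKEY_LOCAL_MACHINE };

    for (int i = 0; i < 2; i++) {
        HKEY hkey;
        DWORD type = 0, len = size;
        LONG rc;

        if (RegOpenKeyExA(roots[i], "Software\\R-core\\R", 0,
                          KEY_READ, &hkey) != ERROR_SUCCESS)
            continue;
        rc = RegQueryValueExA(hkey, "InstallPath", NULL, &type,
                              (LPBYTE) rhome, &len);
        RegCloseKey(hkey);
        if (rc == ERROR_SUCCESS && type == REG_SZ)
            return 1;
    }
    return 0;
}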
+ + + +

Function and variable index

+ +
Index Entry  Section

.
.C: Interface functions .C and .Fortran
.Call: Handling R objects in C
.Call: Calling .Call
.External: Handling R objects in C
.External: Calling .External
.Fortran: Interface functions .C and .Fortran
.Last.lib: Load hooks
.onAttach: Load hooks
.onDetach: Load hooks
.onLoad: Load hooks
.onUnload: Load hooks
.Random.seed: Random numbers

\
\acronym: Marking text
\alias: Documenting functions
\arguments: Documenting functions
\author: Documenting functions
\bold: Marking text
\cite: Marking text
\code: Marking text
\command: Marking text
\concept: Indices
\cr: Sectioning
\CRANpkg{pkg}: User-defined macros
\deqn: Mathematics
\describe: Lists and tables
\description: Documenting functions
\details: Documenting functions
\dfn: Marking text
\doi{numbers}: User-defined macros
\dontrun: Documenting functions
\dontshow: Documenting functions
\dots: Insertions
\dQuote: Marking text
\email: Marking text
\emph: Marking text
\enc: Insertions
\enumerate: Lists and tables
\env: Marking text
\eqn: Mathematics
\examples: Documenting functions
\figure: Figures
\file: Marking text
\format: Documenting data sets
\href: Marking text
\if: Conditional text
\ifelse: Conditional text
\itemize: Lists and tables
\kbd: Marking text
\keyword: Documenting functions
\ldots: Insertions
\link: Cross-references
\method: Documenting functions
\name: Documenting functions
\newcommand: User-defined macros
\note: Documenting functions
\option: Marking text
\out: Conditional text
\packageAuthor: User-defined macros
\packageDescription: User-defined macros
\packageDESCRIPTION: User-defined macros
\packageIndices: User-defined macros
\packageMaintainer: User-defined macros
\packageTitle: User-defined macros
\pkg: Marking text
\preformatted: Marking text
\R: Insertions
\RdOpts: Dynamic pages
\references: Documenting functions
\renewcommand: User-defined macros
\S3method: Documenting functions
\samp: Marking text
\section: Sectioning
\seealso: Documenting functions
\Sexpr: Dynamic pages
\source: Documenting data sets
\sQuote: Marking text
\sspace: User-defined macros
\strong: Marking text
\tabular: Lists and tables
\title: Documenting functions
\url: Marking text
\usage: Documenting functions
\value: Documenting functions
\var: Marking text
\verb: Marking text

A
allocVector: Allocating storage
AUTHORS: Package subdirectories

B
bessel_i: Mathematical functions
bessel_i: Mathematical functions
bessel_j: Mathematical functions
bessel_j: Mathematical functions
bessel_k: Mathematical functions
bessel_k: Mathematical functions
bessel_y: Mathematical functions
bessel_y: Mathematical functions
beta: Mathematical functions
beta: Mathematical functions
BLAS_LIBS: Using Makevars
browser: Browsing

C
Calloc: User-controlled memory
CAR: Calling .External
CDR: Calling .External
cgmin: Optimization
choose: Mathematical functions
choose: Mathematical functions
CITATION: Package subdirectories
CITATION: Preparing translations
COPYRIGHTS: The DESCRIPTION file
COPYRIGHTS: Package subdirectories
cospi: Numerical Utilities
cPsort: Utility functions

D
debug: Debugging R code
debugger: Debugging R code
defineVar: Finding and setting variables
digamma: Mathematical functions
digamma: Mathematical functions
dump.frames: Debugging R code
duplicate: Named objects and copying
dyn.load: dyn.load and dyn.unload
dyn.unload: dyn.load and dyn.unload

E
expm1: Numerical Utilities
export: Specifying imports and exports
exportClasses: Namespaces with S4 classes and methods
exportClassPattern: Namespaces with S4 classes and methods
exportMethods: Namespaces with S4 classes and methods
exportPattern: Specifying imports and exports
exportPattern: Namespaces with S4 classes and methods
exp_rand: Random numbers

F
FALSE: Mathematical constants
findInterval: Utility functions
findVar: Finding and setting variables
FLIBS: Using Makevars
fmax2: Numerical Utilities
fmin2: Numerical Utilities
fprec: Numerical Utilities
Free: User-controlled memory
fround: Numerical Utilities
fsign: Numerical Utilities
ftrunc: Numerical Utilities

G
gammafn: Mathematical functions
gammafn: Mathematical functions
gctorture: Using gctorture
getAttrib: Attributes
getCharCE: Character encoding issues
GetRNGstate: Random numbers

I
imax2: Numerical Utilities
imin2: Numerical Utilities
import: Specifying imports and exports
importClassesFrom: Namespaces with S4 classes and methods
importFrom: Specifying imports and exports
importMethodsFrom: Namespaces with S4 classes and methods
install: Attributes
iPsort: Utility functions
ISNA: Missing and special values
ISNA: Missing and IEEE values
ISNAN: Missing and special values
ISNAN: Missing and IEEE values

L
LAPACK_LIBS: Using Makevars
lbeta: Mathematical functions
lbeta: Mathematical functions
lbfgsb: Optimization
lchoose: Mathematical functions
lchoose: Mathematical functions
lgamma1p: Numerical Utilities
lgammafn: Mathematical functions
lgammafn: Mathematical functions
library.dynam: Package subdirectories
library.dynam: dyn.load and dyn.unload
log1p: Numerical Utilities
log1pexp: Numerical Utilities
log1pmx: Numerical Utilities
logspace_add: Numerical Utilities
logspace_sub: Numerical Utilities
logspace_sum: Numerical Utilities

M
mkChar: Handling character data
mkCharCE: Character encoding issues
mkCharLen: Handling character data
mkCharLenCE: Character encoding issues
M_E: Mathematical constants
M_PI: Mathematical constants

N
NA_REAL: Missing and IEEE values
NEWS.Rd: Package subdirectories
nmmin: Optimization
norm_rand: Random numbers

O
OBJECTS: Using Makevars
OBJECTS: Creating shared objects

P
pentagamma: Mathematical functions
pentagamma: Mathematical functions
PKG_CFLAGS: Creating shared objects
PKG_CPPFLAGS: Creating shared objects
PKG_CXXFLAGS: Creating shared objects
PKG_FCFLAGS: Creating shared objects
PKG_FFLAGS: Creating shared objects
PKG_LIBS: Creating shared objects
PKG_OBJCFLAGS: Creating shared objects
PKG_OBJCXXFLAGS: Creating shared objects
prompt: Rd format
PROTECT: Garbage Collection
PROTECT_WITH_INDEX: Garbage Collection
psigamma: Mathematical functions
psigamma: Mathematical functions
PutRNGstate: Random numbers

Q
qsort3: Utility functions
qsort4: Utility functions

R
R CMD build: Building package tarballs
R CMD check: Checking packages
R CMD config: Configure and cleanup
R CMD Rd2pdf: Processing documentation files
R CMD Rdconv: Processing documentation files
R CMD SHLIB: Creating shared objects
R CMD Stangle: Processing documentation files
R CMD Sweave: Processing documentation files
Rdqagi: Integration
Rdqags: Integration
Realloc: User-controlled memory
recover: Debugging R code
reEnc: Character encoding issues
REprintf: Printing
REPROTECT: Garbage Collection
REvprintf: Printing
revsort: Utility functions
Riconv: Re-encoding
Riconv_close: Re-encoding
Riconv_open: Re-encoding
Rprintf: Printing
Rprof: Profiling R code for speed
Rprof: Memory statistics from Rprof
Rprofmem: Tracking memory allocations
rPsort: Utility functions
rsort_with_index: Utility functions
Rvprintf: Printing
R_addhistory: Setting R callbacks
R_alloc: Transient storage allocation
R_allocLD: Transient storage allocation
R_Busy: Setting R callbacks
R_ChooseFile: Setting R callbacks
R_CleanUp: Setting R callbacks
R_ClearErrConsole: Setting R callbacks
R_csort: Utility functions
R_dataentry: Setting R callbacks
R_dataviewer: Setting R callbacks
R_EditFile: Setting R callbacks
R_EditFiles: Setting R callbacks
R_ExpandFileName: Utility functions
R_FINITE: Missing and IEEE values
R_FlushConsole: Setting R callbacks
R_GetCCallable: Linking to native routines in other packages
R_GetCurrentSrcref: Accessing source references
R_GetSrcFilename: Accessing source references
R_INLINE: Inlining C functions
R_IsNaN: Missing and IEEE values
R_isort: Utility functions
R_LIBRARY_DIR: Configure and cleanup
R_loadhistory: Setting R callbacks
R_max_col: Utility functions
R_NegInf: Missing and IEEE values
R_orderVector: Utility functions
R_PACKAGE_DIR: Configure and cleanup
R_PACKAGE_NAME: Configure and cleanup
R_ParseVector: Parsing R code from C
R_PosInf: Missing and IEEE values
R_pow: Numerical Utilities
R_pow_di: Numerical Utilities
R_PreserveObject: Garbage Collection
R_qsort: Utility functions
R_qsort_I: Utility functions
R_qsort_int: Utility functions
R_qsort_int_I: Utility functions
R_ReadConsole: Setting R callbacks
R_RegisterCCallable: Linking to native routines in other packages
R_registerRoutines: Registering native routines
R_ReleaseObject: Garbage Collection
R_ResetConsole: Setting R callbacks
R_rsort: Utility functions
R_savehistory: Setting R callbacks
R_selectlist: Setting R callbacks
R_ShowFiles: Setting R callbacks
R_ShowMessage: Setting R callbacks
R_Srcref: Accessing source references
R_Suicide: Setting R callbacks
R_tmpnam: Utility functions
R_tmpnam2: Utility functions
R_Version: Platform and version information
R_WriteConsole: Setting R callbacks
R_WriteConsoleEx: Setting R callbacks

S
S3method: Registering S3 methods
SAFE_FFLAGS: Using Makevars
samin: Optimization
seed_in: Random numbers
seed_out: Random numbers
setAttrib: Attributes
setVar: Finding and setting variables
sign: Numerical Utilities
sinpi: Numerical Utilities
summaryRprof: Memory statistics from Rprof
system: Operating system access
system.time: Operating system access
system2: Operating system access
S_alloc: Transient storage allocation
S_realloc: Transient storage allocation

T
tanpi: Numerical Utilities
tetragamma: Mathematical functions
tetragamma: Mathematical functions
trace: Debugging R code
traceback: Debugging R code
tracemem: Tracing copies of an object
translateChar: Character encoding issues
translateCharUTF8: Character encoding issues
trigamma: Mathematical functions
trigamma: Mathematical functions
TRUE: Mathematical constants

U
undebug: Debugging R code
unif_rand: Random numbers
UNPROTECT: Garbage Collection
UNPROTECT_PTR: Garbage Collection
untracemem: Tracing copies of an object
useDynLib: useDynLib

V
vmaxget: Transient storage allocation
vmaxset: Transient storage allocation
vmmin: Optimization

+
+


+
+ +

Concept index

+ +
Index Entry  Section

.
.install_extras file: Writing package vignettes
.Rbuildignore file: Building package tarballs
.Rinstignore file: Package subdirectories

\
\linkS4class: Cross-references

A
Allocating storage: Allocating storage
Attributes: Attributes

B
Bessel functions: Mathematical functions
Beta function: Mathematical functions
Building binary packages: Building binary packages
Building source packages: Building package tarballs

C
C++ code, interfacing: Interfacing C++ code
Calling C from FORTRAN and vice versa: Calling C from FORTRAN and vice versa
Checking packages: Checking packages
citation: Package subdirectories
citation: Preparing translations
Classes: Classes
cleanup file: Package structure
conditionals: Conditional text
configure file: Package structure
Copying objects: Named objects and copying
CRAN: Creating R packages
Creating packages: Creating R packages
Creating shared objects: Creating shared objects
Cross-references in documentation: Cross-references
cumulative hazard: Distribution functions

D
Debugging: Debugging compiled code
DESCRIPTION file: The DESCRIPTION file
Details of R types: Details of R types
Distribution functions from C: Distribution functions
Documentation, writing: Writing R documentation files
Dynamic loading: dyn.load and dyn.unload
dynamic pages: Dynamic pages

E
Editing Rd files: Editing Rd files
encoding: Encoding
Error handling from C: Error handling
Error handling from FORTRAN: Error handling from FORTRAN
Evaluating R expressions from C: Evaluating R expressions from C
external pointer: External pointers and weak references

F
Figures in documentation: Figures
finalizer: External pointers and weak references
Finding variables: Finding and setting variables

G
Gamma function: Mathematical functions
Garbage collection: Garbage Collection
Generic functions: Generic functions and methods

H
handling character data: Handling character data
Handling lists: Handling lists
Handling R objects in C: Handling R objects in C

I
IEEE special values: Missing and special values
IEEE special values: Missing and IEEE values
INDEX file: The INDEX file
Indices: Indices
Inspecting R objects when debugging: Inspecting R objects
integration: Integration
Interfaces to compiled code: Interface functions .C and .Fortran
Interfaces to compiled code: Interface functions .Call and .External
Interfacing C++ code: Interfacing C++ code
Interrupts: Allowing interrupts

L
LICENCE file: Licensing
LICENSE file: Licensing
Lists and tables in documentation: Lists and tables

M
Marking text in documentation: Marking text
Mathematics in documentation: Mathematics
Memory allocation from C: Memory allocation
Memory use: Profiling R code for memory use
Method functions: Generic functions and methods
Missing values: Missing and special values
Missing values: Missing and IEEE values

N
namespaces: Package namespaces
news: Package subdirectories
Numerical analysis subroutines from C: Numerical analysis subroutines
Numerical derivatives: Calculating numerical derivatives

O
OpenMP: OpenMP support
OpenMP: Platform and version information
Operating system access: Operating system access
optimization: Optimization

P
Package builder: Building package tarballs
Package structure: Package structure
Package subdirectories: Package subdirectories
Packages: Creating R packages
Parsing R code from C: Parsing R code from C
Platform-specific documentation: Platform-specific sections
Printing from C: Printing
Printing from FORTRAN: Printing from FORTRAN
Processing Rd format: Processing documentation files
Profiling: Profiling R code for speed
Profiling: Profiling R code for memory use
Profiling: Profiling compiled code

R
Random numbers in C: Random numbers
Random numbers in C: Distribution functions
Random numbers in FORTRAN: Calling C from FORTRAN and vice versa
Registering native routines: Registering native routines

S
Setting variables: Finding and setting variables
Sort functions from C: Utility functions
Sweave: Writing package vignettes

T
tarballs: Building package tarballs
Tidying R code: Tidying R code

U
user-defined macros: User-defined macros

V
Version information from C: Platform and version information
vignettes: Writing package vignettes
Visibility: Controlling visibility

W
weak reference: External pointers and weak references

Z
Zero-finding: Zero-finding

+
+
+

Footnotes

+ +

(1)

+

although this is a persistent +mis-usage. It seems to stem from S, whose analogues of R’s packages +were officially known as library sections and later as +chapters, but almost always referred to as libraries.

+

(2)

+

This +seems to be commonly used for a file in ‘markdown’ format. Be aware +that most users of R will not know that, nor know how to view such a +file: platforms such as OS X and Windows do not have a default viewer +set in their file associations. The CRAN package web pages +render such files in HTML: the converter used expects the file to be +encoded in UTF-8.

+

(3)

+

currently, top-level files +.Rbuildignore and .Rinstignore, and +vignettes/.install_extras.

+

(4)

+

false positives are possible, but only a handful have been +seen so far.

+

(5)

+

at least if this +is done in a locale which matches the package encoding.

+

(6)

+

and +required by CRAN, so checked by R CMD check +--as-cran.

+

(7)

+

But it is checked for Open Source packages +by R CMD check --as-cran.

+

(8)

+

Duplicate +definitions may trigger a warning: see User-defined macros.

+

(9)

+

even one wrapped in \donttest.

+

(10)

+

This includes all packages +directly called by library and require calls, as well as +data obtained via data(theirdata, package = "somepkg") +calls: R CMD check will warn about all of these. But there +are subtler uses which it will not detect: e.g. if package A uses +package B and makes use of functionality in package B which uses package +C which package B suggests or enhances, then package C needs to be in +the ‘Suggests’ list for package A. Nor will undeclared uses in +included files be reported, nor unconditional uses of packages listed +under ‘Enhances’.

+

(11)

+

Extensions +.S and .s arise from code originally written for S(-PLUS), +but are commonly used for assembler code. Extension .q was used +for S, which at one time was tentatively called QPE.

+

(12)

+

but they should be in the encoding +declared in the DESCRIPTION file.

+

(13)

+

This is true for OSes which +implement the ‘C’ locale: Windows’ idea of the ‘C’ locale uses +the WinAnsi charset.

+

(14)

+

More precisely, they can +contain the English alphanumeric characters and the symbols +‘$ - _ . + ! ' ( ) , ;  = &’.

+

(15)

+

Note that Ratfor is not supported. +If you have Ratfor source code, you need to convert it to FORTRAN. Only +FORTRAN 77 (which we write in upper case) is supported on all platforms, +but most also support Fortran-95 (for which we use title case). If you +want to ship Ratfor source files, please do so in a subdirectory of +src and not in the main subdirectory.

+

(16)

+

either or both of which may not be supported on particular +platforms

+

(17)

+

Using .hpp is not guaranteed to be +portable.

+

(18)

+

There +is also ‘__APPLE_CC__’, but that indicates a compiler with +Apple-specific features, not the OS. It is used in +Rinlinedfuns.h.

+

(19)

+

the POSIX +terminology, called ‘make variables’ by GNU make.

+

(20)

+

on all platforms from R 3.1.0

+

(21)

+

The best way to generate such a +file is to copy the .Rout from a successful run of R CMD +check. If you want to generate it separately, do run R with options +--vanilla --slave and with environment variable +LANGUAGE=en set to get messages in English. Be careful not to use +output with the option --timings (and note that +--as-cran sets it).

+

(22)

+

e.g. +https://tools.ietf.org/html/rfc4180.

+

(23)

+

People who have trouble with case are advised to use .rda as a common error is to refer to abc.RData as abc.Rdata!
(24) in POSIX parlance: GNU make calls these ‘make variables’.
(25) at least on Unix-alikes: the Windows build currently resolves such dependencies to a static FORTRAN library when Rblas.dll is built.
(26) http://www.openmp.org/, https://en.wikipedia.org/wiki/OpenMP, https://computing.llnl.gov/tutorials/openMP/
(27) Some builds of clang 3.7 have support for OpenMP 3.1
(28) Windows default, not MinGW-w64 default.
(29) Which it was at the time of writing with GCC, Solaris Studio, Intel and Clang 3.7.x compilers.
(30) some Windows toolchains have the typo ‘_REENTRANCE’ instead.
(31) Cygwin used g77 up to 2011, and some pre-built versions of R for Unix OSes still do.
(32) For details of these and related macros, see file config.site in the R sources.
(33) On systems which use sub-architectures, architecture-specific versions such as ~/.R/check.Renviron.i386 take precedence.
(34) A suitable file.exe is part of the Windows toolset: it checks for gfile if a suitable file is not found: the latter is available in the OpenCSW collection for Solaris at http://www.opencsw.org. The source repository is ftp://ftp.astron.com/pub/file/.
(35) An exception is made for subdirectories with names starting ‘win’ or ‘Win’.
(36) on most other platforms such runtime libraries are dynamic, but static libraries are currently used on Windows because the toolchain is not a standard part of the OS.
(37) or if option --use-valgrind is used or environment variable _R_CHECK_ALWAYS_LOG_VIGNETTE_OUTPUT_ is set to a true value or if there are differences from a target output file
(38) For example, in early 2014 gdata declared ‘Imports: gtools’ and gtools declared ‘Imports: gdata’.
(39) loading, examples, tests, running vignette code
(40) on all platforms from R 3.1.0.
(41) called CVS or .svn or .arch-ids or .bzr or .git (but not files called .git) or .hg.
(42) called .metadata.
(43) which is an error: GNU make uses GNUmakefile.
(44) and to avoid problems with case-insensitive file systems, lower-case versions of all these extensions.
(45) unless inhibited by using ‘BuildVignettes: no’ in the DESCRIPTION file.
(46) provided the conditions of the package’s license are met: many, including CRAN, see the omission of source components as incompatible with an Open Source license.
(47) R_HOME/bin is prepended to the PATH so that references to R or Rscript in the Makefile do make use of the currently running version of R.
(48) Note that lazy-loaded datasets are not in the package’s namespace so need to be accessed via ::, e.g. survival::survexp.us.
(49) they will be called with two unnamed arguments, in that order.
(50) NB: this will only be read in all versions of R if the package contains R code in a R directory.
(51) Note that this is the basename of the shared object, and the appropriate extension (.so or .dll) will be added.
(52) This was necessary at least prior to R 3.0.2 as the methods package looked for its own R code on the search path.
(53) This defaults to the same pattern as exportPattern: use something like exportClassPattern("^$") to override this.
(54) if it does, there will be opaque warnings about replacing imports if the classes/methods are also imported.
(55) People use dev.new() to open a device at a particular size: that is not portable but using dev.new(noRStudioGD = TRUE) helps.
(56) Solaris make does not accept CRLF-terminated Makefiles; Solaris warns about and some other makes ignore incomplete final lines.
(57) This was apparently introduced in SunOS 4, and is available elsewhere provided it is surrounded by spaces.
(58) GNU make, BSD make formerly in FreeBSD and OS X, AT&T make as implemented on Solaris, pmake in FreeBSD, ‘Distributed Make’ (dmake), part of Solaris Studio and available in other versions.
(59) For example, test options -a and -e are not portable, and not supported in the AT&T Bourne shell used on Solaris, even though they are in the POSIX standard.
(60) but note that long long is not a standard C++ type, and C++ compilers set up for strict checking will reject it.
(61) or where supported the variants _Exit and _exit.
(62) This and srandom are in any case not portable. They are in POSIX but not in the C99 standard, and not available on Windows.
(63) in libselinux.
(64) except perhaps the simplest kind as used by download.file() in non-interactive use.
(65) Whereas the GNU linker reorders so -L options are processed first, the Solaris one does not.
(66) some versions of OS X did not.
(67) Not doing so is the default on Windows, overridden for the R executables. It is also the default on some Solaris compilers.
(68) These are not needed for the default compiler settings on ‘x86_64’ but are likely to be needed on ‘ix86’.
(69) Select ‘Save as’, and select ‘Reduce file size’ from the ‘Quartz filter’ menu’: this can be accessed in other ways, for example by Automator.
(70) except perhaps some special characters such as backslash and hash which may be taken over for currency symbols.
(71) Typically on a Unix-alike this is done by telling fontconfig where to find suitable fonts to select glyphs from.
(72) this object is available since R 2.8.0, so the ‘Depends’ field in the DESCRIPTION file should contain something at least as restrictive as ‘R (>= 2.8’.
(73) e.g. \alias, \keyword and \note sections.
(74) There can be exceptions: for example Rd files are not allowed to start with a dot, and have to be uniquely named on a case-insensitive file system.
(75) in the current locale, and with special treatment for LaTeX special characters and with any ‘pkgname-package’ topic moved to the top of the list.
(76) Text between or after list items is discouraged.
(77) as defined by the R function trimws.
(78) Currently it is rendered differently only in HTML conversions, and LaTeX conversion outside ‘\usage’ and ‘\examples’ environments.
(79) a common example in CRAN packages is \link[mgcv]{gam}.
(80) There is only a fine distinction between \dots and \ldots. It is technically incorrect to use \ldots in code blocks and tools::checkRd will warn about this; on the other hand the current converters treat them the same way in code blocks, and elsewhere apart from the small distinction between the two in LaTeX.
(81) See the examples section in the file Paren.Rd for an example.
(82) R 2.9.0 added support for UTF-8 Cyrillic characters in LaTeX, but on some OSes this will need Cyrillic support added to LaTeX, so environment variable _R_CYRILLIC_TEX_ may need to be set to a non-empty value to enable this.
(83) R has to be built to enable this, but the option --enable-R-profiling is the default.
(84) For Unix-alikes these are intervals of CPU time, and for Windows of elapsed time.
(85) With the exceptions of the commands listed below: an object of such a name can be printed via an explicit call to print.
(86) at the time of writing mainly for 10.9 with some support for 10.8, none for the current 10.10.
(87) Those in some numeric, logical, integer, raw, complex vectors and in memory allocated by R_alloc.
(88) including using the data sections of R vectors after they are freed.
(89) small fixed-size arrays by default in gfortran, for example.
(90) currently only on ‘ix86’/‘x86_64’ Linux and OS X (including the builds in Xcode 7 beta but not earlier Apple releases). On some platforms, e.g. Fedora, the runtime library, libasan, needs to be installed separately. OS X users can install a suitable clang from the sources, http://llvm.org/releases/ or possibly distributions such as MacPorts or Homebrew.
(91) part of the LLVM project and in distributed in llvm RPMs and .debs on Linux. It is not currently shipped by Apple.
(92) as Ubuntu does.
(93) installed on some Linux systems as asan_symbolize, and obtainable from https://llvm.org/svn/llvm-project/compiler-rt/trunk/lib/asan/scripts/asan_symbolize.py: it makes use of llvm-symbolizer if available.
(94) e.g. src/main/dotcode.c and parts of the Matrix sources with clang 3.7.0).
(95) or the user manual for your version of clang, e.g. http://llvm.org/releases/3.6.2/tools/docs/UsersManual.html.
(96) This includes the C++ UBSAN handlers, despite its name.
(97) but works better if inlining and frame pointer optimizations are disabled.
(98) possibly after some platform-specific translation, e.g. adding leading or trailing underscores.
(99) Note that this is then not checked for over-runs by option CBoundsCheck = TRUE.
(100) but this is not currently done.
(101) whether or not ‘LinkingTo’ is used.
(102) so there needs to be a corresponding import or importFrom entry in the NAMESPACE file.
(103) dyld on OS X, and DYLD_LIBRARY_PATHS below.
(104) That is, similar to those defined in S version 4 from the 1990s: these are not kept up to date and are not recommended for new projects.
(105) see The R API: note that these are not all part of the API.
(106) SEXP is an acronym for Simple EXPression, common in LISP-like language syntaxes.
(107) If no coercion was required, coerceVector would have passed the old object through unchanged.
(108) You can assign a copy of the object in the environment frame rho using defineVar(symbol, duplicate(value), rho)).
(109) see Character encoding issues for why this might not be what is required.
(110) This is only guaranteed to show the current interface: it is liable to change.
(111) Known problems are redefining LENGTH, error, length, vector and warning
(112) It is an optional C11 extension.
(113) https://en.wikipedia.org/wiki/Endianness.
(114) In the parlance of OS X this is a dynamic library, and is the normal way to build R on that platform.
(115) but these are not part of the automated test procedures and so little tested.
(116) at least on platforms where the values are available, that is having getrlimit and on Linux or having sysctl supporting KERN_USRSTACK, including FreeBSD and OS X.
(117) An attempt to use only threads in the late 1990s failed to work correctly under Windows 95, the predominant version of Windows at that time.

diff --git a/R-intro.html b/R-intro.html
new file mode 100644
index 0000000..fbf480d
--- /dev/null
+++ b/R-intro.html
@@ -0,0 +1,9601 @@

An Introduction to R


This is an introduction to R (“GNU S”), a language and environment for statistical computing and graphics. R is similar to the award-winning S system, which was developed at Bell Laboratories by John Chambers et al. It provides a wide variety of statistical and graphical techniques (linear and nonlinear modelling, statistical tests, time series analysis, classification, clustering, ...).

+

This manual provides information on data types, programming elements, +statistical modelling and graphics. +

+

This manual is for R, version 3.2.3 (2015-12-10). +

+

Copyright © 1990 W. N. Venables
+Copyright © 1992 W. N. Venables & D. M. Smith
+Copyright © 1997 R. Gentleman & R. Ihaka
+Copyright © 1997, 1998 M. Maechler
+Copyright © 1999–2015 R Core Team +

+
+

Permission is granted to make and distribute verbatim copies of this +manual provided the copyright notice and this permission notice are +preserved on all copies. +

+

Permission is granted to copy and distribute modified versions of this +manual under the conditions for verbatim copying, provided that the +entire resulting derived work is distributed under the terms of a +permission notice identical to this one. +

+

Permission is granted to copy and distribute translations of this manual +into another language, under the above conditions for modified versions, +except that this permission notice may be stated in a translation +approved by the R Core Team. +


Preface

+ +

This introduction to R is derived from an original set of notes +describing the S and S-PLUS environments written in 1990–2 by +Bill Venables and David M. Smith when at the University of Adelaide. We +have made a number of small changes to reflect differences between the +R and S programs, and expanded some of the material. +

+

We would like to extend warm thanks to Bill Venables (and David Smith) +for granting permission to distribute this modified version of the notes +in this way, and for being a supporter of R from way back. +

+

Comments and corrections are always welcome. Please address email +correspondence to R-core@R-project.org. +

+ +

Suggestions to the reader

+ +

Most R novices will start with the introductory session in Appendix +A. This should give some familiarity with the style of R sessions +and more importantly some instant feedback on what actually happens. +

+

Many users will come to R mainly for its graphical facilities. +See Graphics, which can be read at almost any time and need not wait +until all the preceding sections have been digested. +

+ + + + +
+ + + +

1 Introduction and preliminaries

+ + + + + + + + + + + + + + +
+ + + +

1.1 The R environment

+ +

R is an integrated suite of software facilities for data +manipulation, calculation and graphical display. Among other things it +has +

+
    +
  • an effective data handling and storage facility, +
  • a suite of operators for calculations on arrays, in particular matrices, +
  • a large, coherent, integrated collection of intermediate tools for data +analysis, +
  • graphical facilities for data analysis and display either directly at +the computer or on hardcopy, and +
  • a well developed, simple and effective programming language (called ‘S’) +which includes conditionals, loops, user defined recursive functions and +input and output facilities. (Indeed most of the system supplied +functions are themselves written in the S language.) +
+ +

The term “environment” is intended to characterize it as a fully +planned and coherent system, rather than an incremental accretion of +very specific and inflexible tools, as is frequently the case with other +data analysis software. +

+

R is very much a vehicle for newly developing methods of interactive +data analysis. It has developed rapidly, and has been extended by a +large collection of packages. However, most programs written in +R are essentially ephemeral, written for a single piece of data +analysis. +

+
+ + + +

1.2 Related software and documentation

+ +

R can be regarded as an implementation of the S language which +was developed at Bell Laboratories by Rick Becker, John Chambers and +Allan Wilks, and also forms the basis of the S-PLUS systems. +

+

The evolution of the S language is characterized by four books by +John Chambers and coauthors. For R, the basic reference is The +New S Language: A Programming Environment for Data Analysis and +Graphics by Richard A. Becker, John M. Chambers and Allan R. +Wilks. The new features of the 1991 release of S +are covered in Statistical Models in S edited by John M. +Chambers and Trevor J. Hastie. The formal methods and classes of the +methods package are based on those described in Programming +with Data by John M. Chambers. See References, for precise +references. +

+

There are now a number of books which describe how to use R for data +analysis and statistics, and documentation for S/S-PLUS can +typically be used with R, keeping the differences between the S +implementations in mind. See What documentation exists for R? in The R statistical system FAQ. +

+
+ + + +

1.3 R and statistics

+ + +

Our introduction to the R environment did not mention statistics, yet many people use R as a statistics system. We prefer to think of it as an environment within which many classical and modern statistical techniques have been implemented. A few of these are built into the base R environment, but many are supplied as packages. There are about 25 packages supplied with R (called “standard” and “recommended” packages) and many more are available through the CRAN family of Internet sites (via https://CRAN.R-project.org) and elsewhere. More details on packages are given later (see Packages).

+

Most classical statistics and much of the latest methodology is +available for use with R, but users may need to be prepared to do a +little work to find it. +

+

There is an important difference in philosophy between S (and hence +R) and the other main statistical systems. In S a statistical +analysis is normally done as a series of steps, with intermediate +results being stored in objects. Thus whereas SAS and SPSS will give +copious output from a regression or discriminant analysis, R will +give minimal output and store the results in a fit object for subsequent +interrogation by further R functions. +

+
+ + + +

1.4 R and the window system

+ +

The most convenient way to use R is at a graphics workstation running +a windowing system. This guide is aimed at users who have this +facility. In particular we will occasionally refer to the use of R +on an X window system although the vast bulk of what is said applies +generally to any implementation of the R environment. +

+

Most users will find it necessary to interact directly with the +operating system on their computer from time to time. In this guide, we +mainly discuss interaction with the operating system on UNIX machines. +If you are running R under Windows or OS X you will need to make +some small adjustments. +

+

Setting up a workstation to take full advantage of the customizable +features of R is a straightforward if somewhat tedious procedure, and +will not be considered further here. Users in difficulty should seek +local expert help. +

+
+ + + +

1.5 Using R interactively

+ +

When you use the R program it issues a prompt when it expects input +commands. The default prompt is ‘>’, which on UNIX might be +the same as the shell prompt, and so it may appear that nothing is +happening. However, as we shall see, it is easy to change to a +different R prompt if you wish. We will assume that the UNIX shell +prompt is ‘$’. +

+

In using R under UNIX the suggested procedure for the first occasion +is as follows: +

+
    +
  1. Create a separate sub-directory, say work, to hold data files on +which you will use R for this problem. This will be the working +directory whenever you use R for this particular problem. + +
    +
    $ mkdir work
    +$ cd work
    +
    + +
  2. Start the R program with the command + +
    +
    $ R
    +
    + +
  3. At this point R commands may be issued (see later). + +
  4. To quit the R program the command is + +
    +
    > q()
    +
    + +

    At this point you will be asked whether you want to save the data from +your R session. On some systems this will bring up a dialog box, and +on others you will receive a text prompt to which you can respond +yes, no or cancel (a single letter abbreviation will +do) to save the data before quitting, quit without saving, or return to +the R session. Data which is saved will be available in future R +sessions. +

    +
+ +

Further R sessions are simple. +

+
    +
  1. Make work the working directory and start the program as before: + +
    +
    $ cd work
    +$ R
    +
    + +
  2. Use the R program, terminating with the q() command at the end +of the session. + +
+ +

To use R under Windows the procedure to +follow is basically the same. Create a folder as the working directory, +and set that in the Start In field in your R shortcut. +Then launch R by double clicking on the icon. +

+ +

1.6 An introductory session

+ +

Readers wishing to get a feel for R at a computer before proceeding +are strongly advised to work through the introductory session +given in A sample session. +

+
+ + + +

1.7 Getting help with functions and features

+ + +

R has an inbuilt help facility similar to the man facility of +UNIX. To get more information on any specific named function, for +example solve, the command is +

+
+
> help(solve)
+
+ + +

An alternative is +

+
+
> ?solve
+
+ + +

For a feature specified by special characters, the argument must be +enclosed in double or single quotes, making it a “character string”: +This is also necessary for a few words with syntactic meaning including +if, for and function. +

+
+
> help("[[")
+
+ +

Either form of quote mark may be used to escape the other, as in the +string "It's important". Our convention is to use +double quote marks for preference. +

+

On most R installations help is available in HTML format by +running +

+
+
> help.start()
+
+ + +

which will launch a Web browser that allows the help pages to be browsed with hyperlinks. On UNIX, subsequent help requests are sent to the HTML-based help system. The ‘Search Engine and Keywords’ link in the page loaded by help.start() is particularly useful as it contains a high-level concept list which searches through available functions. It can be a great way to get your bearings quickly and to understand the breadth of what R has to offer.

+ +

The help.search command (alternatively ??) +allows searching for help in various +ways. For example, +

+
+
> ??solve
+
+ + +

Try ?help.search for details and more examples. +

+

The examples on a help topic can normally be run by +

+
+
> example(topic)
+
+ + +

Windows versions of R have other optional help systems: use +

+
+
> ?help
+
+ +

for further details. +

+
+ + + +

1.8 R commands, case sensitivity, etc.

+ +

Technically R is an expression language with a very simple +syntax. It is case sensitive as are most UNIX based packages, so +A and a are different symbols and would refer to different +variables. The set of symbols which can be used in R names depends +on the operating system and country within which R is being run +(technically on the locale in use). Normally all alphanumeric +symbols are allowed2 (and in +some countries this includes accented letters) plus ‘.’ and +‘_’, with the restriction that a name must start with +‘.’ or a letter, and if it starts with ‘.’ the +second character must not be a digit. Names are effectively +unlimited in length. +

+

Elementary commands consist of either expressions or +assignments. If an expression is given as a command, it is +evaluated, printed (unless specifically made invisible), and the value +is lost. An assignment also evaluates an expression and passes the +value to a variable but the result is not automatically printed. +

+

Commands are separated either by a semi-colon (‘;’), or by a +newline. Elementary commands can be grouped together into one compound +expression by braces (‘{’ and ‘}’). +Comments can be put almost3 anywhere, +starting with a hashmark (‘#’), everything to the end of the +line is a comment. +
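
For instance, a short made-up exchange showing both separators and a comment:

> a <- 3; b <- 4    # two elementary commands on one line, separated by ';'
> a + b             # everything from '#' to the end of the line is ignored
[1] 7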

+

If a command is not complete at the end of a line, R will give a different prompt, by default ‘+’,

on second and subsequent lines and continue to read input until the +command is syntactically complete. This prompt may be changed by the +user. We will generally omit the continuation prompt +and indicate continuation by simple indenting. +

+

Command lines entered at the console are limited4 to about 4095 bytes (not characters). +

+
+ + + +

1.9 Recall and correction of previous commands

+ +

Under many versions of UNIX and on Windows, R provides a mechanism +for recalling and re-executing previous commands. The vertical arrow +keys on the keyboard can be used to scroll forward and backward through +a command history. Once a command is located in this way, the +cursor can be moved within the command using the horizontal arrow keys, +and characters can be removed with the DEL key or added with the +other keys. More details are provided later: see The command-line editor. +

+

The recall and editing capabilities under UNIX are highly customizable. +You can find out how to do this by reading the manual entry for the +readline library. +

+

Alternatively, the Emacs text editor provides more general support +mechanisms (via ESS, Emacs Speaks Statistics) for +working interactively with R. See R and Emacs in The R +statistical system FAQ. +

+
+ + + +

1.10 Executing commands from or diverting output to a file

+ + +

If commands5 are stored in an external +file, say commands.R in the working directory work, they +may be executed at any time in an R session with the command +

+
+
> source("commands.R")
+
+ + +

For Windows Source is also available on the +File menu. The function sink, +

+
+
> sink("record.lis")
+
+ + +

will divert all subsequent output from the console to an external file, +record.lis. The command +

+
+
> sink()
+
+ +

restores it to the console once again. +

+
+ + + +

1.11 Data permanency and removing objects

+ +

The entities that R creates and manipulates are known as +objects. These may be variables, arrays of numbers, character +strings, functions, or more general structures built from such +components. +

+

During an R session, objects are created and stored by name (we +discuss this process in the next session). The R command +

+
+
> objects()
+
+ +

(alternatively, ls()) can be used to display the names of (most +of) the objects which are currently stored within R. The collection +of objects currently stored is called the workspace. + +

+

To remove objects the function rm is available: +

+
+
> rm(x, y, z, ink, junk, temp, foo, bar)
+
+ + + +

All objects created during an R session can be stored permanently in +a file for use in future R sessions. At the end of each R session +you are given the opportunity to save all the currently available +objects. If you indicate that you want to do this, the objects are +written to a file called .RData6 in the +current directory, and the command lines used in the session are saved +to a file called .Rhistory. +

+

When R is started at later time from the same directory it reloads +the workspace from this file. At the same time the associated commands +history is reloaded. +

+

It is recommended that you should use separate working directories for +analyses conducted with R. It is quite common for objects with names +x and y to be created during an analysis. Names like this +are often meaningful in the context of a single analysis, but it can be +quite hard to decide what they might be when the several analyses have +been conducted in the same directory. +

+
+ +
+


+
+ +

2 Simple manipulations; numbers and vectors

+ + + + + + + + + + + + + +
+ + + +

2.1 Vectors and assignment

+ +

R operates on named data structures. The simplest such +structure is the numeric vector, which is a single entity +consisting of an ordered collection of numbers. To set up a vector +named x, say, consisting of five numbers, namely 10.4, 5.6, 3.1, +6.4 and 21.7, use the R command +

+
+
> x <- c(10.4, 5.6, 3.1, 6.4, 21.7)
+
+ + + +

This is an assignment statement using the function +c() which in this context can take an arbitrary number of vector +arguments and whose value is a vector got by concatenating its +arguments end to end.7 +

+

A number occurring by itself in an expression is taken as a vector of +length one. +

+

Notice that the assignment operator (‘<-’), which consists +of the two characters ‘<’ (“less than”) and +‘-’ (“minus”) occurring strictly side-by-side and it +‘points’ to the object receiving the value of the expression. +In most contexts the ‘=’ operator can be used as an alternative. + +

+

Assignment can also be made using the function assign(). An +equivalent way of making the same assignment as above is with: +

+
+
> assign("x", c(10.4, 5.6, 3.1, 6.4, 21.7))
+
+ +

The usual operator, <-, can be thought of as a syntactic +short-cut to this. +

+

Assignments can also be made in the other direction, using the obvious +change in the assignment operator. So the same assignment could be made +using +

+
+
> c(10.4, 5.6, 3.1, 6.4, 21.7) -> x
+
+ +

If an expression is used as a complete command, the value is printed +and lost8. So now if we +were to use the command +

+
+
> 1/x
+
+ +

the reciprocals of the five values would be printed at the terminal (and +the value of x, of course, unchanged). +

+

The further assignment +

+
+
> y <- c(x, 0, x)
+
+ +

would create a vector y with 11 entries consisting of two copies +of x with a zero in the middle place. +

+
+ + + +

2.2 Vector arithmetic

+ +

Vectors can be used in arithmetic expressions, in which case the +operations are performed element by element. Vectors occurring in the +same expression need not all be of the same length. If they are not, +the value of the expression is a vector with the same length as the +longest vector which occurs in the expression. Shorter vectors in the +expression are recycled as often as need be (perhaps +fractionally) until they match the length of the longest vector. In +particular a constant is simply repeated. So with the above assignments +the command + +

+
+
> v <- 2*x + y + 1
+
+ +

generates a new vector v of length 11 constructed by adding +together, element by element, 2*x repeated 2.2 times, y +repeated just once, and 1 repeated 11 times. +

+ +

The elementary arithmetic operators are the usual +, -, +*, / and ^ for raising to a power. + + + + + +In addition all of the common arithmetic functions are available. +log, exp, sin, cos, tan, sqrt, +and so on, all have their usual meaning. + + + + + + +max and min select the largest and smallest elements of a +vector respectively. + + +range is a function whose value is a vector of length two, namely +c(min(x), max(x)). + +length(x) is the number of elements in x, + +sum(x) gives the total of the elements in x, + +and prod(x) their product. + +

+

Two statistical functions are mean(x) which calculates the sample +mean, which is the same as sum(x)/length(x), + +and var(x) which gives +

+
+
sum((x-mean(x))^2)/(length(x)-1)
+
+ + +

or sample variance. If the argument to var() is an +n-by-p matrix the value is a p-by-p sample +covariance matrix got by regarding the rows as independent +p-variate sample vectors. +

+

sort(x) returns a vector of the same size as x with the +elements arranged in increasing order; however there are other more +flexible sorting facilities available (see order() or +sort.list() which produce a permutation to do the sorting). + + +

+

Note that max and min select the largest and smallest +values in their arguments, even if they are given several vectors. The +parallel maximum and minimum functions pmax and +pmin return a vector (of length equal to their longest argument) +that contains in each element the largest (smallest) element in that +position in any of the input vectors. + + +
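
A small made-up example may make the distinction clearer:

> u <- c(1, 8, 3)
> v <- c(4, 2, 9)
> min(u, v)     # a single number, the smallest value found in all the arguments
[1] 1
> pmin(u, v)    # element-by-element (parallel) minima
[1] 1 2 3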

+

For most purposes the user will not be concerned if the “numbers” in a +numeric vector are integers, reals or even complex. Internally +calculations are done as double precision real numbers, or double +precision complex numbers if the input data are complex. +

+

To work with complex numbers, supply an explicit complex part. Thus +

+
+
sqrt(-17)
+
+ +

will give NaN and a warning, but +

+
+
sqrt(-17+0i)
+
+ +

will do the computations as complex numbers. +

+ + + + +
+ + + +

2.3 Generating regular sequences

+ + +

R has a number of facilities for generating commonly used sequences +of numbers. For example 1:30 is the vector c(1, 2, +…, 29, 30). + +The colon operator has high priority within an expression, so, for +example 2*1:15 is the vector c(2, 4, …, 28, 30). +Put n <- 10 and compare the sequences 1:n-1 and +1:(n-1). +
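
Carrying out that comparison at the console shows the effect of the precedence rule:

> n <- 10
> 1:n-1      # ':' binds more tightly than '-', so this is (1:10) - 1
[1] 0 1 2 3 4 5 6 7 8 9
> 1:(n-1)
[1] 1 2 3 4 5 6 7 8 9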

+

The construction 30:1 may be used to generate a sequence +backwards. +

+ +

The function seq() is a more general facility for generating +sequences. It has five arguments, only some of which may be specified +in any one call. The first two arguments, if given, specify the +beginning and end of the sequence, and if these are the only two +arguments given the result is the same as the colon operator. That is +seq(2,10) is the same vector as 2:10. +

+

Arguments to seq(), and to many other R functions, can also +be given in named form, in which case the order in which they appear is +irrelevant. The first two arguments may be named +from=value and to=value; thus +seq(1,30), seq(from=1, to=30) and seq(to=30, +from=1) are all the same as 1:30. The next two arguments to +seq() may be named by=value and +length=value, which specify a step size and a length for +the sequence respectively. If neither of these is given, the default +by=1 is assumed. +

+

For example +

+
+
> seq(-5, 5, by=.2) -> s3
+
+ +

generates in s3 the vector c(-5.0, -4.8, -4.6, …, +4.6, 4.8, 5.0). Similarly +

+
+
> s4 <- seq(length=51, from=-5, by=.2)
+
+ +

generates the same vector in s4. +

+

The fifth argument may be named along=vector, which is +normally used as the only argument to create the sequence 1, 2, +…, length(vector), or the empty sequence if the vector is +empty (as it can be). +

+

A related function is rep() + +which can be used for replicating an object in various complicated ways. +The simplest form is +

+
+
> s5 <- rep(x, times=5)
+
+ +

which will put five copies of x end-to-end in s5. Another +useful version is +

+
+
> s6 <- rep(x, each=5)
+
+ +

which repeats each element of x five times before moving on to +the next. +

+
+ + + +

2.4 Logical vectors

+ +

As well as numerical vectors, R allows manipulation of logical +quantities. The elements of a logical vector can have the values +TRUE, FALSE, and NA (for “not available”, see +below). The first two are often abbreviated as T and F, +respectively. Note however that T and F are just +variables which are set to TRUE and FALSE by default, but +are not reserved words and hence can be overwritten by the user. Hence, +you should always use TRUE and FALSE. + + + + +

+

Logical vectors are generated by conditions. For example +

+
+
> temp <- x > 13
+
+ +

sets temp as a vector of the same length as x with values +FALSE corresponding to elements of x where the condition +is not met and TRUE where it is. +

+

The logical operators are <, <=, >, >=, +== for exact equality and != for inequality. + + + + + + +In addition if c1 and c2 are logical expressions, then +c1 & c2 is their intersection (“and”), c1 | c2 +is their union (“or”), and !c1 is the negation of +c1. + + + +
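
For example, with two short logical vectors (the values are invented purely for illustration):

> c1 <- c(TRUE, TRUE, FALSE)
> c2 <- c(TRUE, FALSE, FALSE)
> c1 & c2     # intersection ("and")
[1]  TRUE FALSE FALSE
> c1 | c2     # union ("or")
[1]  TRUE  TRUE FALSE
> !c1         # negation
[1] FALSE FALSE  TRUE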

+

Logical vectors may be used in ordinary arithmetic, in which case they +are coerced into numeric vectors, FALSE becoming 0 +and TRUE becoming 1. However there are situations where +logical vectors and their coerced numeric counterparts are not +equivalent, for example see the next subsection. +
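
For instance, using the vector x assigned earlier, the number of elements exceeding 13 can be counted by summing the logical vector directly:

> sum(x > 13)     # TRUE is coerced to 1 and FALSE to 0
[1] 1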

+
+ + + +

2.5 Missing values

+ + +

In some cases the components of a vector may not be completely +known. When an element or value is “not available” or a “missing +value” in the statistical sense, a place within a vector may be +reserved for it by assigning it the special value NA. + +In general any operation on an NA becomes an NA. The +motivation for this rule is simply that if the specification of an +operation is incomplete, the result cannot be known and hence is not +available. +

+ +

The function is.na(x) gives a logical vector of the same size as +x with value TRUE if and only if the corresponding element +in x is NA. +

+
+
> z <- c(1:3,NA);  ind <- is.na(z)
+
+ +

Notice that the logical expression x == NA is quite different +from is.na(x) since NA is not really a value but a marker +for a quantity that is not available. Thus x == NA is a vector +of the same length as x all of whose values are NA +as the logical expression itself is incomplete and hence undecidable. +
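
With the vector z just created the difference can be seen directly:

> z == NA       # every comparison with NA is itself NA
[1] NA NA NA NA
> is.na(z)
[1] FALSE FALSE FALSE  TRUE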

+

Note that there is a second kind of “missing” values which are +produced by numerical computation, the so-called Not a Number, +NaN, + +values. Examples are +

+
+
> 0/0
+
+ +

or +

+
+
> Inf - Inf
+
+ +

which both give NaN since the result cannot be defined sensibly. +

+

In summary, is.na(xx) is TRUE both for NA +and NaN values. To differentiate these, is.nan(xx) is only +TRUE for NaNs. + +

+

Missing values are sometimes printed as <NA> when character +vectors are printed without quotes. +

+
+ + + +

2.6 Character vectors

+ + +

Character quantities and character vectors are used frequently in R, +for example as plot labels. Where needed they are denoted by a sequence +of characters delimited by the double quote character, e.g., +"x-values", "New iteration results". +

+

Character strings are entered using either matching double (") or +single (') quotes, but are printed using double quotes (or +sometimes without quotes). They use C-style escape sequences, using +\ as the escape character, so \\ is entered and printed as +\\, and inside double quotes " is entered as \". +Other useful escape sequences are \n, newline, \t, tab and +\b, backspace—see ?Quotes for a full list. +

+

Character vectors may be concatenated into a vector by the c() +function; examples of their use will emerge frequently. + +

+ +

The paste() function takes an arbitrary number of arguments and +concatenates them one by one into character strings. Any numbers given +among the arguments are coerced into character strings in the evident +way, that is, in the same way they would be if they were printed. The +arguments are by default separated in the result by a single blank +character, but this can be changed by the named argument, +sep=string, which changes it to string, +possibly empty. +

+

For example +

+
+
> labs <- paste(c("X","Y"), 1:10, sep="")
+
+ +

makes labs into the character vector +

+
+
c("X1", "Y2", "X3", "Y4", "X5", "Y6", "X7", "Y8", "X9", "Y10")
+
+ +

Note particularly that recycling of short lists takes place here too; +thus c("X", "Y") is repeated 5 times to match the sequence +1:10. +9 +

+
+ + + +

2.7 Index vectors; selecting and modifying subsets of a data set

+ + +

Subsets of the elements of a vector may be selected by appending to the +name of the vector an index vector in square brackets. More +generally any expression that evaluates to a vector may have subsets of +its elements similarly selected by appending an index vector in square +brackets immediately after the expression. +

+ +

Such index vectors can be any of four distinct types. +

+
    +
  1. A logical vector. In this case the index vector is recycled to the +same length as the vector from which elements are to be selected. +Values corresponding to TRUE in the index vector are selected and +those corresponding to FALSE are omitted. For example + +
    +
    > y <- x[!is.na(x)]
    +
    + +

    creates (or re-creates) an object y which will contain the +non-missing values of x, in the same order. Note that if +x has missing values, y will be shorter than x. +Also +

    +
    +
    > (x+1)[(!is.na(x)) & x>0] -> z
    +
    + +

    creates an object z and places in it the values of the vector +x+1 for which the corresponding value in x was both +non-missing and positive. +

    +
  2. A vector of positive integral quantities. In this case the +values in the index vector must lie in the set {1, 2, …, +length(x)}. The corresponding elements of the vector are +selected and concatenated, in that order, in the result. The +index vector can be of any length and the result is of the same length +as the index vector. For example x[6] is the sixth component of +x and + +
    +
    > x[1:10]
    +
    + +

    selects the first 10 elements of x (assuming length(x) is +not less than 10). Also +

    +
    +
    > c("x","y")[rep(c(1,2,2,1), times=4)]
    +
    + +

    (an admittedly unlikely thing to do) produces a character vector of +length 16 consisting of "x", "y", "y", "x" repeated four times. +

    +
  3. A vector of negative integral quantities. Such an index vector +specifies the values to be excluded rather than included. Thus + +
    +
    > y <- x[-(1:5)]
    +
    + +

    gives y all but the first five elements of x. +

    +
  4. A vector of character strings. This possibility only applies +where an object has a names attribute to identify its components. +In this case a sub-vector of the names vector may be used in the same way +as the positive integral labels in item 2 further above. + +
    +
    > fruit <- c(5, 10, 1, 20)
    +> names(fruit) <- c("orange", "banana", "apple", "peach")
    +> lunch <- fruit[c("apple","orange")]
    +
    + +

    The advantage is that alphanumeric names are often easier to +remember than numeric indices. This option is particularly +useful in connection with data frames, as we shall see later. +

    +
+ +

An indexed expression can also appear on the receiving end of an +assignment, in which case the assignment operation is performed +only on those elements of the vector. The expression must be of +the form vector[index_vector] as having an arbitrary +expression in place of the vector name does not make much sense here. +

+

For example +

+
+
> x[is.na(x)] <- 0
+
+ +

replaces any missing values in x by zeros and +

+
+
> y[y < 0] <- -y[y < 0]
+
+ +

has the same effect as +

+
+
> y <- abs(y)
+
+ +
+ + + +

2.8 Other types of objects

+ +

Vectors are the most important type of object in R, but there are +several others which we will meet more formally in later sections. +

+
    +
  • matrices or more generally arrays are multi-dimensional +generalizations of vectors. In fact, they are vectors that can +be indexed by two or more indices and will be printed in special ways. +See Arrays and matrices. + +
  • factors provide compact ways to handle categorical data. +See Factors. + +
  • lists are a general form of vector in which the various elements +need not be of the same type, and are often themselves vectors or lists. +Lists provide a convenient way to return the results of a statistical +computation. See Lists. + +
  • data frames are matrix-like structures, in which the columns can +be of different types. Think of data frames as ‘data matrices’ with one +row per observational unit but with (possibly) both numerical and +categorical variables. Many experiments are best described by data +frames: the treatments are categorical but the response is numeric. +See Data frames. + +
  • functions are themselves objects in R which can be stored in +the project’s workspace. This provides a simple and convenient way to +extend R. See Writing your own functions. + +
+ +
+ + + +

3 Objects, their modes and attributes

+ + + + + + + + + + + +
+ + + +

3.1 Intrinsic attributes: mode and length

+ +

The entities R operates on are technically known as objects. +Examples are vectors of numeric (real) or complex values, vectors of +logical values and vectors of character strings. These are known as +“atomic” structures since their components are all of the same type, +or mode, namely numeric10, complex, +logical, character and raw. +

+

Vectors must have their values all of the same mode. Thus any +given vector must be unambiguously either logical, +numeric, complex, character or raw. (The +only apparent exception to this rule is the special “value” listed as +NA for quantities not available, but in fact there are several +types of NA). Note that a vector can be empty and still have a +mode. For example the empty character string vector is listed as +character(0) and the empty numeric vector as numeric(0). +

+

R also operates on objects called lists, which are of mode +list. These are ordered sequences of objects which individually +can be of any mode. lists are known as “recursive” rather than +atomic structures since their components can themselves be lists in +their own right. +

+

The other recursive structures are those of mode function and +expression. Functions are the objects that form part of the R +system along with similar user written functions, which we discuss in +some detail later. Expressions as objects form an +advanced part of R which will not be discussed in this guide, except +indirectly when we discuss formulae used with modeling in R. +

+

By the mode of an object we mean the basic type of its +fundamental constituents. This is a special case of a “property” +of an object. Another property of every object is its length. The +functions mode(object) and length(object) can be +used to find out the mode and length of any defined structure +11. +

+

Further properties of an object are usually provided by +attributes(object), see Getting and setting attributes. +Because of this, mode and length are also called “intrinsic +attributes” of an object. + + +

+

For example, if z is a complex vector of length 100, then in an +expression mode(z) is the character string "complex" and +length(z) is 100. +
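
This is easily checked at the console, using a throw-away vector w built only for this illustration:

> w <- complex(real = 1:100)   # a complex vector of length 100
> mode(w)
[1] "complex"
> length(w)
[1] 100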

+

R caters for changes of mode almost anywhere it could be considered +sensible to do so, (and a few where it might not be). For example with +

+
+
> z <- 0:9
+
+ +

we could put +

+
+
> digits <- as.character(z)
+
+ +

after which digits is the character vector c("0", "1", "2", +…, "9"). A further coercion, or change of mode, +reconstructs the numerical vector again: +

+
+
> d <- as.integer(digits)
+
+ +

Now d and z are the same.12 There is a +large collection of functions of the form as.something() +for either coercion from one mode to another, or for investing an object +with some other attribute it may not already possess. The reader should +consult the different help files to become familiar with them. +

+ +
+ + + +

3.2 Changing the length of an object

+ +

An “empty” object may still have a mode. For example +

+
+
> e <- numeric()
+
+ +

makes e an empty vector structure of mode numeric. Similarly character() is an empty character vector, and so on. Once an object of any size has been created, new components may be added to it simply by giving it an index value outside its previous range. Thus

+
+
> e[3] <- 17
+
+ +

now makes e a vector of length 3, (the first two components of +which are at this point both NA). This applies to any structure +at all, provided the mode of the additional component(s) agrees with the +mode of the object in the first place. +

+

This automatic adjustment of lengths of an object is used often, for +example in the scan() function for input. (see The scan() function.) +

+

Conversely to truncate the size of an object requires only an assignment +to do so. Hence if alpha is an object of length 10, then +

+
+
> alpha <- alpha[2 * 1:5]
+
+ +

makes it an object of length 5 consisting of just the former components +with even index. (The old indices are not retained, of course.) We can +then retain just the first three values by +

+
+
> length(alpha) <- 3
+
+ +

and vectors can be extended (by missing values) in the same way. +

+
+ + + +

3.3 Getting and setting attributes

+ + + +

The function attributes(object) + +returns a list of all the non-intrinsic attributes currently defined for +that object. The function attr(object, name) + +can be used to select a specific attribute. These functions are rarely +used, except in rather special circumstances when some new attribute is +being created for some particular purpose, for example to associate a +creation date or an operator with an R object. The concept, however, +is very important. +

+

Some care should be exercised when assigning or deleting attributes +since they are an integral part of the object system used in R. +

+

When it is used on the left hand side of an assignment it can be used +either to associate a new attribute with object or to +change an existing one. For example +

+
+
> attr(z, "dim") <- c(10,10)
+
+ +

allows R to treat z as if it were a 10-by-10 matrix. +

+
+ + + +

3.4 The class of an object

+ + +

All objects in R have a class, reported by the function +class. For simple vectors this is just the mode, for example +"numeric", "logical", "character" or "list", +but "matrix", "array", "factor" and +"data.frame" are other possible values. +

+

A special attribute known as the class of the object is used to +allow for an object-oriented style13 of +programming in R. For example if an object has class +"data.frame", it will be printed in a certain way, the +plot() function will display it graphically in a certain way, and +other so-called generic functions such as summary() will react to +it as an argument in a way sensitive to its class. +

+

To remove temporarily the effects of class, use the function +unclass(). + +For example if winter has the class "data.frame" then +

+
+
> winter
+
+ +

will print it in data frame form, which is rather like a matrix, whereas +

+
+
> unclass(winter)
+
+ +

will print it as an ordinary list. Only in rather special situations do +you need to use this facility, but one is when you are learning to come +to terms with the idea of class and generic functions. +

+

Generic functions and classes will be discussed further in Object orientation, but only briefly. +

+
+ +
+


+
+ +

4 Ordered and unordered factors

+ + + +

A factor is a vector object used to specify a discrete +classification (grouping) of the components of other vectors of the same length. +R provides both ordered and unordered factors. +While the “real” application of factors is with model formulae +(see Contrasts), we here look at a specific example. +

+ +

4.1 A specific example

+ +

Suppose, for example, we have a sample of 30 tax accountants from all +the states and territories of Australia14 +and their individual state of origin is specified by a character vector +of state mnemonics as +

+
+
> state <- c("tas", "sa",  "qld", "nsw", "nsw", "nt",  "wa",  "wa",
+             "qld", "vic", "nsw", "vic", "qld", "qld", "sa",  "tas",
+             "sa",  "nt",  "wa",  "vic", "qld", "nsw", "nsw", "wa",
+             "sa",  "act", "nsw", "vic", "vic", "act")
+
+ +

Notice that in the case of a character vector, “sorted” means sorted +in alphabetical order. +

+

A factor is similarly created using the factor() function: + +

+
+
> statef <- factor(state)
+
+ +

The print() function handles factors slightly differently from +other objects: +

+
+
> statef
+ [1] tas sa  qld nsw nsw nt  wa  wa  qld vic nsw vic qld qld sa
+[16] tas sa  nt  wa  vic qld nsw nsw wa  sa  act nsw vic vic act
+Levels:  act nsw nt qld sa tas vic wa
+
+ +

To find out the levels of a factor the function levels() can be +used. + +

+
+
> levels(statef)
+[1] "act" "nsw" "nt"  "qld" "sa"  "tas" "vic" "wa"
+
+ + + + + + +
+ +
+


+
+ +

4.2 The function tapply() and ragged arrays

+ + +

To continue the previous example, suppose we have the incomes of the +same tax accountants in another vector (in suitably large units of +money) +

+
+
> incomes <- c(60, 49, 40, 61, 64, 60, 59, 54, 62, 69, 70, 42, 56,
+               61, 61, 61, 58, 51, 48, 65, 49, 49, 41, 48, 52, 46,
+               59, 46, 58, 43)
+
+ +

To calculate the sample mean income for each state we can now use the +special function tapply(): +

+
+
> incmeans <- tapply(incomes, statef, mean)
+
+ +

giving a means vector with the components labelled by the levels +

+
+
   act    nsw     nt    qld     sa    tas    vic     wa
+44.500 57.333 55.500 53.600 55.000 60.500 56.000 52.250
+
+ +

The function tapply() is used to apply a function, here +mean(), to each group of components of the first argument, here +incomes, defined by the levels of the second component, here +statef15, as if they were separate vector +structures. The result is a structure of the same length as the levels +attribute of the factor containing the results. The reader should +consult the help document for more details. +

+

Suppose further we needed to calculate the standard errors of the state income means. To do this we need to write an R function to calculate the standard error for any given vector. Since there is a builtin function var() to calculate the sample variance, such a function is a very simple one liner, specified by the assignment:

+
+
> stderr <- function(x) sqrt(var(x)/length(x))
+
+ +

(Writing functions will be considered later in Writing your own functions, and in this case was unnecessary as R also has a builtin +function sd().) + + +After this assignment, the standard errors are calculated by +

+
+
> incster <- tapply(incomes, statef, stderr)
+
+ +

and the values calculated are then +

+
+
> incster
+act    nsw  nt    qld     sa tas   vic     wa
+1.5 4.3102 4.5 4.1061 2.7386 0.5 5.244 2.6575
+
+ +

As an exercise you may care to find the usual 95% confidence limits for +the state mean incomes. To do this you could use tapply() once +more with the length() function to find the sample sizes, and the +qt() function to find the percentage points of the appropriate +t-distributions. (You could also investigate R’s facilities +for t-tests.) +
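
One possible sketch of a solution, reusing the objects created above (the intermediate names ns and hw are introduced only for this illustration, and other approaches are equally valid):

> ns <- tapply(incomes, statef, length)    # sample size in each state
> hw <- qt(0.975, ns - 1) * incster        # half-widths of the 95% intervals
> rbind(lower = incmeans - hw, upper = incmeans + hw)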

+

The function tapply() can also be used to handle more complicated +indexing of a vector by multiple categories. For example, we might wish +to split the tax accountants by both state and sex. However in this +simple instance (just one factor) what happens can be thought of as +follows. The values in the vector are collected into groups +corresponding to the distinct entries in the factor. The function is +then applied to each of these groups individually. The value is a +vector of function results, labelled by the levels attribute of +the factor. +

+

The combination of a vector and a labelling factor is an example of what +is sometimes called a ragged array, since the subclass sizes are +possibly irregular. When the subclass sizes are all the same the +indexing may be done implicitly and much more efficiently, as we see in +the next section. +

+ +
+ + + +

4.3 Ordered factors

+ + +

The levels of factors are stored in alphabetical order, or in the order +they were specified to factor if they were specified explicitly. +

+

Sometimes the levels will have a natural ordering that we want to record +and want our statistical analysis to make use of. The ordered() + +function creates such ordered factors but is otherwise identical to +factor. For most purposes the only difference between ordered +and unordered factors is that the former are printed showing the +ordering of the levels, but the contrasts generated for them in fitting +linear models are different. +
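
A small illustration with invented data:

> sizes <- ordered(c("small", "large", "medium", "small"), levels = c("small", "medium", "large"))
> sizes
[1] small  large  medium small 
Levels: small < medium < large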

+ +
+ +
+


+
+ +

5 Arrays and matrices

+ + + + + + + + + + + + + + +
+ + + +

5.1 Arrays

+ + + +

An array can be considered as a multiply subscripted collection of data +entries, for example numeric. R allows simple facilities for +creating and handling arrays, and in particular the special case of +matrices. +

+

A dimension vector is a vector of non-negative integers. If its length is +k then the array is k-dimensional, e.g. a matrix is a +2-dimensional array. The dimensions are indexed from one up to +the values given in the dimension vector. +

+

A vector can be used by R as an array only if it has a dimension +vector as its dim attribute. Suppose, for example, z is a +vector of 1500 elements. The assignment +

+
+
> dim(z) <- c(3,5,100)
+
+ + +

gives it the dim attribute that allows it to be treated as a +3 by 5 by 100 array. +

+

Other functions such as matrix() and array() are available +for simpler and more natural looking assignments, as we shall see in +The array() function. +

+

The values in the data vector give the values in the array in the same +order as they would occur in FORTRAN, that is “column major order,” +with the first subscript moving fastest and the last subscript slowest. +

+

For example if the dimension vector for an array, say a, is +c(3,4,2) then there are 3 * 4 * 2 += 24 entries in a and the data vector holds them in the order +a[1,1,1], a[2,1,1], …, a[2,4,2], a[3,4,2]. +
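
This ordering is easily checked by building such an array explicitly (filled with 1:24 purely for illustration):

> a <- array(1:24, dim = c(3,4,2))
> a[2,1,1]    # the second entry of the data vector
[1] 2
> a[1,2,1]    # the fourth entry: the first subscript moves fastest
[1] 4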

+

Arrays can be one-dimensional: such arrays are usually treated in the +same way as vectors (including when printing), but the exceptions can +cause confusion. +

+
+ +
+


+
+ +

5.2 Array indexing. Subsections of an array

+ + +

Individual elements of an array may be referenced by giving the name of +the array followed by the subscripts in square brackets, separated by +commas. +

+

More generally, subsections of an array may be specified by giving a +sequence of index vectors in place of subscripts; however +if any index position is given an empty index vector, then the +full range of that subscript is taken. +

+

Continuing the previous example, a[2,,] is a 4 * +2 array with dimension vector c(4,2) and data vector containing +the values +

+
+
c(a[2,1,1], a[2,2,1], a[2,3,1], a[2,4,1],
+  a[2,1,2], a[2,2,2], a[2,3,2], a[2,4,2])
+
+ +

in that order. a[,,] stands for the entire array, which is the +same as omitting the subscripts entirely and using a alone. +

+

For any array, say Z, the dimension vector may be referenced +explicitly as dim(Z) (on either side of an assignment). +

+

Also, if an array name is given with just one subscript or index +vector, then the corresponding values of the data vector only are used; +in this case the dimension vector is ignored. This is not the case, +however, if the single index is not a vector but itself an array, as we +next discuss. +

+ + + + + +
+ + + +

5.3 Index matrices

+ +

As well as an index vector in any subscript position, a matrix may be +used with a single index matrix in order either to assign a vector +of quantities to an irregular collection of elements in the array, or to +extract an irregular collection as a vector. +

+

A matrix example makes the process clear. In the case of a doubly +indexed array, an index matrix may be given consisting of two columns +and as many rows as desired. The entries in the index matrix are the +row and column indices for the doubly indexed array. Suppose for +example we have a 4 by 5 array X and we wish to do +the following: +

+
    +
  • Extract elements X[1,3], X[2,2] and X[3,1] as a +vector structure, and +
  • Replace these entries in the array X by zeroes. +
+

In this case we need a 3 by 2 subscript array, as in the +following example. +

+
+
> x <- array(1:20, dim=c(4,5))   # Generate a 4 by 5 array.
+> x
+     [,1] [,2] [,3] [,4] [,5]
+[1,]    1    5    9   13   17
+[2,]    2    6   10   14   18
+[3,]    3    7   11   15   19
+[4,]    4    8   12   16   20
+> i <- array(c(1:3,3:1), dim=c(3,2))
+> i                             # i is a 3 by 2 index array.
+     [,1] [,2]
+[1,]    1    3
+[2,]    2    2
+[3,]    3    1
+> x[i]                          # Extract those elements
+[1] 9 6 3
+> x[i] <- 0                     # Replace those elements by zeros.
+> x
+     [,1] [,2] [,3] [,4] [,5]
+[1,]    1    5    0   13   17
+[2,]    2    0   10   14   18
+[3,]    0    7   11   15   19
+[4,]    4    8   12   16   20
+>
+
+

Negative indices are not allowed in index matrices. NA and zero +values are allowed: rows in the index matrix containing a zero are +ignored, and rows containing an NA produce an NA in the +result. +

+ +

As a less trivial example, suppose we wish to generate an (unreduced) +design matrix for a block design defined by factors blocks +(b levels) and varieties (v levels). Further +suppose there are n plots in the experiment. We could proceed as +follows: +

+
+
> Xb <- matrix(0, n, b)
+> Xv <- matrix(0, n, v)
+> ib <- cbind(1:n, blocks)
+> iv <- cbind(1:n, varieties)
+> Xb[ib] <- 1
+> Xv[iv] <- 1
+> X <- cbind(Xb, Xv)
+
+ +

To construct the incidence matrix, N say, we could use +

+
+
> N <- crossprod(Xb, Xv)
+
+ + +

However a simpler direct way of producing this matrix is to use +table(): + +

+
+
> N <- table(blocks, varieties)
+
+ +

Index matrices must be numerical: any other form of matrix (e.g. a +logical or character matrix) supplied as a matrix is treated as an +indexing vector. +

+
+ + + +

5.4 The array() function

+ + +

As well as giving a vector structure a dim attribute, arrays can +be constructed from vectors by the array function, which has the +form +

+
+
> Z <- array(data_vector, dim_vector)
+
+ +

For example, if the vector h contains 24 or fewer numbers, then the command

+
+
> Z <- array(h, dim=c(3,4,2))
+
+ +

would use h to set up a 3 by 4 by 2 array in Z. If the size of h is exactly 24 the result is the same as

+
+
> Z <- h ; dim(Z) <- c(3,4,2)
+
+ +

However if h is shorter than 24, its values are recycled from the +beginning again to make it up to size 24 (see The recycling rule) +but dim(h) <- c(3,4,2) would signal an error about mismatching +length. +As an extreme but common example +

+
+
> Z <- array(0, c(3,4,2))
+
+ +

makes Z an array of all zeros. +

+

At this point dim(Z) stands for the dimension vector +c(3,4,2), and Z[1:24] stands for the data vector as it was +in h, and Z[] with an empty subscript or Z with no +subscript stands for the entire array as an array. +

+

Arrays may be used in arithmetic expressions and the result is an array +formed by element-by-element operations on the data vector. The +dim attributes of operands generally need to be the same, and +this becomes the dimension vector of the result. So if A, +B and C are all similar arrays, then +

+
+
> D <- 2*A*B + C + 1
+
+ +

makes D a similar array with its data vector being the result of +the given element-by-element operations. However the precise rule +concerning mixed array and vector calculations has to be considered a +little more carefully. +

+ + + + +
+ + + +

5.4.1 Mixed vector and array arithmetic. The recycling rule

+ + +

The precise rule affecting element by element mixed calculations with +vectors and arrays is somewhat quirky and hard to find in the +references. From experience we have found the following to be a reliable +guide. +

+
    +
  • The expression is scanned from left to right. +
  • Any short vector operands are extended by recycling their values until +they match the size of any other operands. +
  • As long as short vectors and arrays only are encountered, the +arrays must all have the same dim attribute or an error results. +
  • Any vector operand longer than a matrix or array operand generates an error. +
  • If array structures are present and no error or coercion to vector has +been precipitated, the result is an array structure with the common +dim attribute of its array operands. +
+ +
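As a small sketch of the rule in action (A is an arbitrary 2 by 3 matrix chosen for illustration):

> A <- matrix(1:6, 2, 3)
> A + c(10, 20)        # the short vector is recycled to the length of A
     [,1] [,2] [,3]
[1,]   11   13   15
[2,]   22   24   26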
+ + + +

5.5 The outer product of two arrays

+ + +

An important operation on arrays is the outer product. If +a and b are two numeric arrays, their outer product is an +array whose dimension vector is obtained by concatenating their two +dimension vectors (order is important), and whose data vector is got by +forming all possible products of elements of the data vector of a +with those of b. The outer product is formed by the special +operator %o%: + +

+
+
> ab <- a %o% b
+
+ +

An alternative is +

+
+
> ab <- outer(a, b, "*")
+
+ + +

The multiplication function can be replaced by an arbitrary function of +two variables. For example if we wished to evaluate the function +f(x; y) = cos(y)/(1 + x^2) +over a regular grid of values with x- and y-coordinates +defined by the R vectors x and y respectively, we could +proceed as follows: +

+
+
> f <- function(x, y) cos(y)/(1 + x^2)
+> z <- outer(x, y, f)
+
+ +

In particular the outer product of two ordinary vectors is a doubly +subscripted array (that is a matrix, of rank at most 1). Notice that +the outer product operator is of course non-commutative. Defining your +own R functions will be considered further in Writing your own functions. +

+ +

An example: Determinants of 2 by 2 single-digit matrices

+ +

As an artificial but cute example, consider the determinants of 2 +by 2 matrices [a, b; c, d] where each entry is a +non-negative integer in the range 0, 1, …, 9, that is a +digit. +

+

The problem is to find the determinants, ad - bc, of all possible +matrices of this form and represent the frequency with which each value +occurs as a high density plot. This amounts to finding the +probability distribution of the determinant if each digit is chosen +independently and uniformly at random. +

+

A neat way of doing this uses the outer() function twice: +

+
+
> d <- outer(0:9, 0:9)
+> fr <- table(outer(d, d, "-"))
+> plot(as.numeric(names(fr)), fr, type="h",
+       xlab="Determinant", ylab="Frequency")
+
+ +

Notice the coercion of the names attribute of the frequency table +to numeric in order to recover the range of the determinant values. The +“obvious” way of doing this problem with for loops, to be +discussed in Loops and conditional execution, is so inefficient as +to be impractical. +

+

It is also perhaps surprising that about 1 in 20 such matrices is +singular. +

+
+ + + +

5.6 Generalized transpose of an array

+ + +

The function aperm(a, perm) + +may be used to permute an array, a. The argument perm +must be a permutation of the integers {1, …, k}, where +k is the number of subscripts in a. The result of the +function is an array of the same size as a but with old dimension +given by perm[j] becoming the new j-th dimension. The +easiest way to think of this operation is as a generalization of +transposition for matrices. Indeed if A is a matrix, (that is, a +doubly subscripted array) then B given by +

+
+
> B <- aperm(A, c(2,1))
+
+ +

is just the transpose of A. For this special case a simpler +function t() + +is available, so we could have used B <- t(A). +

+
+ + + +

5.7 Matrix facilities

+ + + +

As noted above, a matrix is just an array with two subscripts. However +it is such an important special case it needs a separate discussion. +R contains many operators and functions that are available only for +matrices. For example t(X) is the matrix transpose function, as +noted above. The functions nrow(A) and ncol(A) give the +number of rows and columns in the matrix A respectively. + + +

+ + + + + + + + +
+ + + +

5.7.1 Matrix multiplication

+ + +

The operator %*% is used for matrix multiplication. + +An n by 1 or 1 by n matrix may of course be +used as an n-vector if in the context such is appropriate. +Conversely, vectors which occur in matrix multiplication expressions are +automatically promoted either to row or column vectors, whichever is +multiplicatively coherent, if possible, (although this is not always +unambiguously possible, as we see later). +

+

If, for example, A and B are square matrices of the same +size, then +

+
+
> A * B
+
+ +

is the matrix of element by element products and +

+
+
> A %*% B
+
+ +

is the matrix product. If x is a vector, then +

+
+
> x %*% A %*% x
+
+ +

is a quadratic form.16 +

+ +

The function crossprod() forms “crossproducts”, meaning that +crossprod(X, y) is the same as t(X) %*% y but the +operation is more efficient. If the second argument to +crossprod() is omitted it is taken to be the same as the first. +

+ +

The meaning of diag() depends on its argument. diag(v), +where v is a vector, gives a diagonal matrix with elements of the +vector as the diagonal entries. On the other hand diag(M), where +M is a matrix, gives the vector of main diagonal entries of +M. This is the same convention as that used for diag() in +MATLAB. Also, somewhat confusingly, if k is a single +numeric value then diag(k) is the k by k identity +matrix! +
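For instance, a short sketch of the three cases:

> diag(c(2, 5, 9))     # a vector argument gives a diagonal matrix
> M <- matrix(1:9, 3, 3)
> diag(M)              # a matrix argument gives its main diagonal
[1] 1 5 9
> diag(3)              # a single number k gives the k by k identity matrix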

+
+ + + +

5.7.2 Linear equations and inversion

+ + + +

Solving linear equations is the inverse of matrix multiplication. +When after +

+
+
> b <- A %*% x
+
+ +

only A and b are given, the vector x is the +solution of that linear equation system. In R, +

+
+
> solve(A,b)
+
+ +

solves the system, returning x (up to some accuracy loss). +Note that in linear algebra, formally +x = A^{-1} %*% b +where +A^{-1} denotes the inverse of +A, which can be computed by +

+
+
solve(A)
+
+ +

but rarely is needed. Numerically, it is both inefficient and +potentially unstable to compute x <- solve(A) %*% b instead of +solve(A,b). +

+

The quadratic form  x %*% A^{-1} %*% +x   which is used in multivariate computations, should be computed by +something like17 x %*% solve(A,x), rather +than computing the inverse of A. +

+
+ + + +

5.7.3 Eigenvalues and eigenvectors

+ + + +

The function eigen(Sm) calculates the eigenvalues and +eigenvectors of a symmetric matrix Sm. The result of this +function is a list of two components named values and +vectors. The assignment +

+
+
> ev <- eigen(Sm)
+
+ +

will assign this list to ev. Then ev$val is the vector of +eigenvalues of Sm and ev$vec is the matrix of +corresponding eigenvectors. Had we only needed the eigenvalues we could +have used the assignment: +

+
+
> evals <- eigen(Sm)$values
+
+ +

evals now holds the vector of eigenvalues and the second +component is discarded. If the expression +

+
+
> eigen(Sm)
+
+ +

is used by itself as a command the two components are printed, with +their names. For large matrices it is better to avoid computing the +eigenvectors if they are not needed by using the expression +

+
+
> evals <- eigen(Sm, only.values = TRUE)$values
+
+ + +
+ + + +

5.7.4 Singular value decomposition and determinants

+ + + +

The function svd(M) takes an arbitrary matrix argument, M, +and calculates the singular value decomposition of M. This +consists of a matrix of orthonormal columns U with the same +column space as M, a second matrix of orthonormal columns +V whose column space is the row space of M and a diagonal +matrix of positive entries D such that M = U %*% D %*% +t(V). D is actually returned as a vector of the diagonal +elements. The result of svd(M) is actually a list of three +components named d, u and v, with evident meanings. +

+

If M is in fact square, then, it is not hard to see that +

+
+
> absdetM <- prod(svd(M)$d)
+
+ +

calculates the absolute value of the determinant of M. If this +calculation were needed often with a variety of matrices it could be +defined as an R function +

+
+
> absdet <- function(M) prod(svd(M)$d)
+
+ + +

after which we could use absdet() as just another R function. +As a further trivial but potentially useful example, you might like to +consider writing a function, say tr(), to calculate the trace of +a square matrix. [Hint: You will not need to use an explicit loop. +Look again at the diag() function.] +
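One possible solution to this exercise, given here only as a sketch, is

> tr <- function(M) sum(diag(M))
> tr(diag(1:4))        # trace of a diagonal matrix with entries 1, 2, 3, 4
[1] 10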

+ + +

R has a builtin function det to calculate a determinant, including the sign, and another, determinant, to give the sign and modulus (optionally on log scale).
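For example (M here is a hypothetical 2 by 2 matrix):

> M <- matrix(c(2, 1, 1, 3), 2, 2)
> det(M)               # signed determinant: 2*3 - 1*1
[1] 5
> determinant(M)       # log-modulus and sign, returned separately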

+ +
+ + + +

5.7.5 Least squares fitting and the QR decomposition

+ + + +

The function lsfit() returns a list giving results of a least +squares fitting procedure. An assignment such as +

+
+
> ans <- lsfit(X, y)
+
+ + +

gives the results of a least squares fit where y is the vector of +observations and X is the design matrix. See the help facility +for more details, and also for the follow-up function ls.diag() +for, among other things, regression diagnostics. Note that a grand mean +term is automatically included and need not be included explicitly as a +column of X. Further note that you almost always will prefer +using lm(.) (see Linear models) to lsfit() for +regression modelling. +

+ +

Another closely related function is qr() and its allies. +Consider the following assignments +

+
+
> Xplus <- qr(X)
+> b <- qr.coef(Xplus, y)
+> fit <- qr.fitted(Xplus, y)
+> res <- qr.resid(Xplus, y)
+
+ +

These compute the orthogonal projection of y onto the range of +X in fit, the projection onto the orthogonal complement in +res and the coefficient vector for the projection in b, +that is, b is essentially the result of the MATLAB +‘backslash’ operator. +

+

It is not assumed that X has full column rank. Redundancies will +be discovered and removed as they are found. +

+

This alternative is the older, low-level way to perform least squares +calculations. Although still useful in some contexts, it would now +generally be replaced by the statistical models features, as will be +discussed in Statistical models in R. +

+ +
+ + + +

5.8 Forming partitioned matrices, cbind() and rbind()

+ + + +

As we have already seen informally, matrices can be built up from other +vectors and matrices by the functions cbind() and rbind(). +Roughly cbind() forms matrices by binding together matrices +horizontally, or column-wise, and rbind() vertically, or +row-wise. +

+

In the assignment +

+
+
> X <- cbind(arg_1, arg_2, arg_3, …)
+
+ +

the arguments to cbind() must be either vectors of any length, or +matrices with the same column size, that is the same number of rows. +The result is a matrix with the concatenated arguments arg_1, +arg_2, … forming the columns. +

+

If some of the arguments to cbind() are vectors they may be +shorter than the column size of any matrices present, in which case they +are cyclically extended to match the matrix column size (or the length +of the longest vector if no matrices are given). +

+

The function rbind() does the corresponding operation for rows. +In this case any vector argument, possibly cyclically extended, are of +course taken as row vectors. +

+

Suppose X1 and X2 have the same number of rows. To +combine these by columns into a matrix X, together with an +initial column of 1s we can use +

+
+
> X <- cbind(1, X1, X2)
+
+ +

The result of rbind() or cbind() always has matrix status. +Hence cbind(x) and rbind(x) are possibly the simplest ways +explicitly to allow the vector x to be treated as a column or row +matrix respectively. +

+
+ + + +

5.9 The concatenation function, c(), with arrays

+ +

It should be noted that whereas cbind() and rbind() are +concatenation functions that respect dim attributes, the basic +c() function does not, but rather clears numeric objects of all +dim and dimnames attributes. This is occasionally useful +in its own right. +

+

The official way to coerce an array back to a simple vector object is to +use as.vector() +

+
+
> vec <- as.vector(X)
+
+ + +

However a similar result can be achieved by using c() with just +one argument, simply for this side-effect: +

+
+
> vec <- c(X)
+
+ + +

There are slight differences between the two, but ultimately the choice +between them is largely a matter of style (with the former being +preferable). +

+
+ + + +

5.10 Frequency tables from factors

+ + +

Recall that a factor defines a partition into groups. Similarly a pair +of factors defines a two way cross classification, and so on. + +The function table() allows frequency tables to be calculated +from equal length factors. If there are k factor arguments, +the result is a k-way array of frequencies. +

+

Suppose, for example, that statef is a factor giving the state +code for each entry in a data vector. The assignment +

+
+
> statefr <- table(statef)
+
+ +

gives in statefr a table of frequencies of each state in the +sample. The frequencies are ordered and labelled by the levels +attribute of the factor. This simple case is equivalent to, but more +convenient than, +

+
+
> statefr <- tapply(statef, statef, length)
+
+ +

Further suppose that incomef is a factor giving a suitably +defined “income class” for each entry in the data vector, for example +with the cut() function: +

+
+
> factor(cut(incomes, breaks = 35+10*(0:7))) -> incomef
+
+ + +

Then to calculate a two-way table of frequencies: +

+
+
> table(incomef,statef)
+         statef
+incomef   act nsw nt qld sa tas vic wa
+  (35,45]   1   1  0   1  0   0   1  0
+  (45,55]   1   1  1   1  2   0   1  3
+  (55,65]   0   3  1   3  2   2   2  1
+  (65,75]   0   1  0   0  0   0   1  0
+
+ +

Extension to higher-way frequency tables is immediate. +

+
+ + + +

6 Lists and data frames

+ + + + + + + +
+ + + +

6.1 Lists

+ + +

An R list is an object consisting of an ordered collection of +objects known as its components. +

+

There is no particular need for the components to be of the same mode or +type, and, for example, a list could consist of a numeric vector, a +logical value, a matrix, a complex vector, a character array, a +function, and so on. Here is a simple example of how to make a list: +

+
+
> Lst <- list(name="Fred", wife="Mary", no.children=3,
+              child.ages=c(4,7,9))
+
+ + +

Components are always numbered and may always be referred to as +such. Thus if Lst is the name of a list with four components, +these may be individually referred to as Lst[[1]], +Lst[[2]], Lst[[3]] and Lst[[4]]. If, further, +Lst[[4]] is a vector subscripted array then Lst[[4]][1] is +its first entry. +

+

If Lst is a list, then the function length(Lst) gives the +number of (top level) components it has. +

+

Components of lists may also be named, and in this case the +component may be referred to either by giving the component name as a +character string in place of the number in double square brackets, or, +more conveniently, by giving an expression of the form +

+
+
> name$component_name
+
+ +

for the same thing. +

+

This is a very useful convention as it makes it easier to get the right +component if you forget the number. +

+

So in the simple example given above: +

+

Lst$name is the same as Lst[[1]] and is the string +"Fred", +

+

Lst$wife is the same as Lst[[2]] and is the string +"Mary", +

+

Lst$child.ages[1] is the same as Lst[[4]][1] and is the +number 4. +

+

Additionally, one can also use the names of the list components in +double square brackets, i.e., Lst[["name"]] is the same as +Lst$name. This is especially useful, when the name of the +component to be extracted is stored in another variable as in +

+
+
> x <- "name"; Lst[[x]]
+
+ +

It is very important to distinguish Lst[[1]] from Lst[1]. +‘[[]]’ is the operator used to select a single +element, whereas ‘[]’ is a general subscripting +operator. Thus the former is the first object in the list +Lst, and if it is a named list the name is not included. +The latter is a sublist of the list Lst consisting of the +first entry only. If it is a named list, the names are transferred to +the sublist. +

+

The names of components may be abbreviated down to the minimum number of +letters needed to identify them uniquely. Thus Lst$coefficients +may be minimally specified as Lst$coe and Lst$covariance +as Lst$cov. +

+

The vector of names is in fact simply an attribute of the list like any +other and may be handled as such. Other structures besides lists may, +of course, similarly be given a names attribute also. +

+
+ +
+


+
+ +

6.2 Constructing and modifying lists

+ +

New lists may be formed from existing objects by the function +list(). An assignment of the form +

+
+
> Lst <- list(name_1=object_1, …, name_m=object_m)
+
+ +

sets up a list Lst of m components using object_1, +…, object_m for the components and giving them names as +specified by the argument names, (which can be freely chosen). If these +names are omitted, the components are numbered only. The components +used to form the list are copied when forming the new list and +the originals are not affected. +

+

Lists, like any subscripted object, can be extended by specifying +additional components. For example +

+
+
> Lst[5] <- list(matrix=Mat)
+
+ + + + + +
+ + + +

6.2.1 Concatenating lists

+ + + +

When the concatenation function c() is given list arguments, the +result is an object of mode list also, whose components are those of the +argument lists joined together in sequence. +

+
+
> list.ABC <- c(list.A, list.B, list.C)
+
+ +

Recall that with vector objects as arguments the concatenation function +similarly joined together all arguments into a single vector structure. +In this case all other attributes, such as dim attributes, are +discarded. +

+ +
+ + + +

6.3 Data frames

+ + +

A data frame is a list with class "data.frame". There are +restrictions on lists that may be made into data frames, namely +

+
    +
  • The components must be vectors (numeric, character, or logical), +factors, numeric matrices, lists, or other data frames. +
  • Matrices, lists, and data frames provide as many variables to the new +data frame as they have columns, elements, or variables, respectively. +
  • Numeric vectors, logicals and factors are included as is, and by +default18 character vectors are coerced to be +factors, whose levels are the unique values appearing in the vector. +
  • Vector structures appearing as variables of the data frame must all have +the same length, and matrix structures must all have the same +row size. +
+ +

A data frame may for many purposes be regarded as a matrix with columns +possibly of differing modes and attributes. It may be displayed in +matrix form, and its rows and columns extracted using matrix indexing +conventions. +

+ + + + + + + + +
+ +
+


+
+ +

6.3.1 Making data frames

+ +

Objects satisfying the restrictions placed on the columns (components) +of a data frame may be used to form one using the function +data.frame: + +

+
+
> accountants <- data.frame(home=statef, loot=incomes, shot=incomef)
+
+ +

A list whose components conform to the restrictions of a data frame may +be coerced into a data frame using the function +as.data.frame() + +

+

The simplest way to construct a data frame from scratch is to use the +read.table() function to read an entire data frame from an +external file. This is discussed further in Reading data from files. +

+
+ + + +

6.3.2 attach() and detach()

+ + + +

The $ notation, such as accountants$home, for list +components is not always very convenient. A useful facility would be +somehow to make the components of a list or data frame temporarily +visible as variables under their component name, without the need to +quote the list name explicitly each time. +

+

The attach() function takes a ‘database’ such as a list or data +frame as its argument. Thus suppose lentils is a +data frame with three variables lentils$u, lentils$v, +lentils$w. The attach +

+
+
> attach(lentils)
+
+ +

places the data frame in the search path at position 2, and provided +there are no variables u, v or w in position 1, +u, v and w are available as variables from the data +frame in their own right. At this point an assignment such as +

+
+
> u <- v+w
+
+ +

does not replace the component u of the data frame, but rather +masks it with another variable u in the working directory at +position 1 on the search path. To make a permanent change to the +data frame itself, the simplest way is to resort once again to the +$ notation: +

+
+
> lentils$u <- v+w
+
+ +

However the new value of component u is not visible until the +data frame is detached and attached again. +

+

To detach a data frame, use the function +

+
+
> detach()
+
+ +

More precisely, this statement detaches from the search path the entity +currently at position 2. Thus in the present context the variables +u, v and w would be no longer visible, except under +the list notation as lentils$u and so on. Entities at positions +greater than 2 on the search path can be detached by giving their number +to detach, but it is much safer to always use a name, for example +by detach(lentils) or detach("lentils") +

+
+

Note: In R lists and data frames can only be attached at position 2 or +above, and what is attached is a copy of the original object. +You can alter the attached values via assign, but the +original list or data frame is unchanged. +

+ +
+ + + +

6.3.3 Working with data frames

+ +

A useful convention that allows you to work with many different problems +comfortably together in the same working directory is +

+
    +
  • gather together all variables for any well defined and separate problem +in a data frame under a suitably informative name; +
  • when working with a problem attach the appropriate data frame at +position 2, and use the working directory at level 1 for +operational quantities and temporary variables; +
  • before leaving a problem, add any variables you wish to keep for future +reference to the data frame using the $ form of assignment, and +then detach(); +
  • finally remove all unwanted variables from the working directory and +keep it as clean of left-over temporary variables as possible. +
+ +

In this way it is quite simple to work with many problems in the same +directory, all of which have variables named x, y and +z, for example. +

+
+ + + +

6.3.4 Attaching arbitrary lists

+ +

attach() is a generic function that allows not only directories +and data frames to be attached to the search path, but other classes of +object as well. In particular any object of mode "list" may be +attached in the same way: +

+
+
> attach(any.old.list)
+
+ +

Anything that has been attached can be detached by detach, by +position number or, preferably, by name. +

+
+ + + +

6.3.5 Managing the search path

+ + + +

The function search shows the current search path and so is +a very useful way to keep track of which data frames and lists (and +packages) have been attached and detached. Initially it gives +

+
+
> search()
+[1] ".GlobalEnv"   "Autoloads"    "package:base"
+
+

where .GlobalEnv is the workspace.19 +

+

After lentils is attached we have +

+
+
> search()
+[1] ".GlobalEnv"   "lentils"      "Autoloads"    "package:base"
+> ls(2)
+[1] "u" "v" "w"
+
+ +

and as we see ls (or objects) can be used to examine the +contents of any position on the search path. +

+

Finally, we detach the data frame and confirm it has been removed from +the search path. +

+
+
> detach("lentils")
+> search()
+[1] ".GlobalEnv"   "Autoloads"    "package:base"
+
+ +
+ + + +

7 Reading data from files

+ + +

Large data objects will usually be read as values from external files +rather than entered during an R session at the keyboard. R input +facilities are simple and their requirements are fairly strict and even +rather inflexible. There is a clear presumption by the designers of +R that you will be able to modify your input files using other tools, +such as file editors or Perl20 to fit in with the +requirements of R. Generally this is very simple. +

+

If variables are to be held mainly in data frames, as we strongly +suggest they should be, an entire data frame can be read directly with +the read.table() function. There is also a more primitive input +function, scan(), that can be called directly. +

+

For more details on importing data into R and also exporting data, +see the R Data Import/Export manual. +

+ + + + + + + +
+ + + +

7.1 The read.table() function

+ + +

To read an entire data frame directly, the external file will normally +have a special form. +

+
    +
  • The first line of the file should have a name for each variable +in the data frame. + +
  • Each additional line of the file has as its first item a row label +and the values for each variable. +
+ +

If the file has one fewer item in its first line than in its second, this +arrangement is presumed to be in force. So the first few lines of a file +to be read as a data frame might look as follows. +

+
+
+
+
Input file form with names and row labels:
+
+     Price    Floor     Area   Rooms     Age  Cent.heat
+01   52.00    111.0      830     5       6.2      no
+02   54.75    128.0      710     5       7.5      no
+03   57.50    101.0     1000     5       4.2      no
+04   57.50    131.0      690     6       8.8      no
+05   59.75     93.0      900     5       1.9     yes
+...
+
+
+
+ +

By default numeric items (except row labels) are read as numeric +variables and non-numeric variables, such as Cent.heat in the +example, as factors. This can be changed if necessary. +

+

The function read.table() can then be used to read the data frame +directly +

+
+
> HousePrice <- read.table("houses.data")
+
+ +

Often you will want to omit including the row labels directly and use the +default labels. In this case the file may omit the row label column as in +the following. +

+
+
+
+
Input file form without row labels:
+
+Price    Floor     Area   Rooms     Age  Cent.heat
+52.00    111.0      830     5       6.2      no
+54.75    128.0      710     5       7.5      no
+57.50    101.0     1000     5       4.2      no
+57.50    131.0      690     6       8.8      no
+59.75     93.0      900     5       1.9     yes
+...
+
+
+
+ +

The data frame may then be read as +

+
+
> HousePrice <- read.table("houses.data", header=TRUE)
+
+ +

where the header=TRUE option specifies that the first line is a +line of headings, and hence, by implication from the form of the file, +that no explicit row labels are given. +
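As noted earlier, the default coercion of non-numeric columns (such as Cent.heat) to factors can be changed; a minimal sketch using the stringsAsFactors argument of read.table:

> HousePrice <- read.table("houses.data", header=TRUE, stringsAsFactors=FALSE)

which keeps Cent.heat as a character variable rather than a factor.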

+ + + + +
+ + + +

7.2 The scan() function

+ + +

Suppose the data vectors are of equal length and are to be read in +parallel. Further suppose that there are three vectors, the first of +mode character and the remaining two of mode numeric, and the file is +input.dat. The first step is to use scan() to read in the +three vectors as a list, as follows +

+
+
> inp <- scan("input.dat", list("",0,0))
+
+ +

The second argument is a dummy list structure that establishes the mode +of the three vectors to be read. The result, held in inp, is a +list whose components are the three vectors read in. To separate the +data items into three separate vectors, use assignments like +

+
+
> label <- inp[[1]]; x <- inp[[2]]; y <- inp[[3]]
+
+ +

More conveniently, the dummy list can have named components, in which +case the names can be used to access the vectors read in. For example +

+
+
> inp <- scan("input.dat", list(id="", x=0, y=0))
+
+ +

If you wish to access the variables separately they may either be +re-assigned to variables in the working frame: +

+
+
> label <- inp$id; x <- inp$x; y <- inp$y
+
+ +

or the list may be attached at position 2 of the search path +(see Attaching arbitrary lists). +

+

If the second argument is a single value and not a list, a single vector +is read in, all components of which must be of the same mode as the +dummy value. +

+
+
> X <- matrix(scan("light.dat", 0), ncol=5, byrow=TRUE)
+
+ +

There are more elaborate input facilities available and these are +detailed in the manuals. +

+
+ + + +

7.3 Accessing builtin datasets

+ + + +

Around 100 datasets are supplied with R (in package datasets), +and others are available in packages (including the recommended packages +supplied with R). To see the list of datasets currently available +use +

+
+
data()
+
+ +

All the datasets supplied with R are available directly by name. +However, many packages still use the obsolete convention in which +data was also used to load datasets into R, for example +

+
+
data(infert)
+
+ +

and this can still be used with the standard packages (as in this +example). In most cases this will load an R object of the same name. +However, in a few cases it loads several objects, so see the on-line +help for the object to see what to expect. +

+ +

7.3.1 Loading data from other R packages

+ +

To access data from a particular package, use the package +argument, for example +

+
+
data(package="rpart")
+data(Puromycin, package="datasets")
+
+ +

If a package has been attached by library, its datasets are +automatically included in the search. +

+

User-contributed packages can be a rich source of datasets. +

+
+ + + +

7.4 Editing data

+ + +

When invoked on a data frame or matrix, edit brings up a separate +spreadsheet-like environment for editing. This is useful for making +small changes once a data set has been read. The command +

+
+
> xnew <- edit(xold)
+
+ +

will allow you to edit your data set xold, and on completion the +changed object is assigned to xnew. If you want to alter the +original dataset xold, the simplest way is to use +fix(xold), which is equivalent to xold <- edit(xold). +

+

Use +

+
+
> xnew <- edit(data.frame())
+
+ +

to enter new data via the spreadsheet interface. +

+ +
+ + + +

8 Probability distributions

+ + + + + + + + +
+ + + +

8.1 R as a set of statistical tables

+ +

One convenient use of R is to provide a comprehensive set of +statistical tables. Functions are provided to evaluate the cumulative +distribution function P(X <= x), +the probability density function and the quantile function (given +q, the smallest x such that P(X <= x) > q), +and to simulate from the distribution. +

+
+ + + + + + + + + + + + + + + + + + + + + +
Distribution         R name     additional arguments
beta                 beta       shape1, shape2, ncp
binomial             binom      size, prob
Cauchy               cauchy     location, scale
chi-squared          chisq      df, ncp
exponential          exp        rate
F                    f          df1, df2, ncp
gamma                gamma      shape, scale
geometric            geom       prob
hypergeometric       hyper      m, n, k
log-normal           lnorm      meanlog, sdlog
logistic             logis      location, scale
negative binomial    nbinom     size, prob
normal               norm       mean, sd
Poisson              pois       lambda
signed rank          signrank   n
Student's t          t          df, ncp
uniform              unif       min, max
Weibull              weibull    shape, scale
Wilcoxon             wilcox     m, n
+
+ +

Prefix the name given here by ‘d’ for the density, ‘p’ for the +CDF, ‘q’ for the quantile function and ‘r’ for simulation +(random deviates). The first argument is x for +dxxx, q for pxxx, p for +qxxx and n for rxxx (except for +rhyper, rsignrank and rwilcox, for which it is +nn). In not quite all cases is the non-centrality parameter +ncp currently available: see the on-line help for details. +

+

The pxxx and qxxx functions all have logical +arguments lower.tail and log.p and the dxxx +ones have log. This allows, e.g., getting the cumulative (or +“integrated”) hazard function, H(t) = - log(1 - F(t)), by +

+
+
 - pxxx(t, ..., lower.tail = FALSE, log.p = TRUE)
+
+ +

or more accurate log-likelihoods (by dxxx(..., log = +TRUE)), directly. +

+

In addition there are functions ptukey and qtukey for the +distribution of the studentized range of samples from a normal +distribution, and dmultinom and rmultinom for the +multinomial distribution. Further distributions are available in +contributed packages, notably SuppDists. +

+

Here are some examples +

+
+
> ## 2-tailed p-value for t distribution
+> 2*pt(-2.43, df = 13)
+> ## upper 1% point for an F(2, 7) distribution
+> qf(0.01, 2, 7, lower.tail = FALSE)
+
+ +

See the on-line help on RNG for how random-number generation is +done in R. +

+
+ + + +

8.2 Examining the distribution of a set of data

+ +

Given a (univariate) set of data we can examine its distribution in a +large number of ways. The simplest is to examine the numbers. Two +slightly different summaries are given by summary and +fivenum + + +and a display of the numbers by stem (a “stem and leaf” plot). + +

+
+
> attach(faithful)
+> summary(eruptions)
+   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
+  1.600   2.163   4.000   3.488   4.454   5.100
+> fivenum(eruptions)
+[1] 1.6000 2.1585 4.0000 4.4585 5.1000
+> stem(eruptions)
+
+  The decimal point is 1 digit(s) to the left of the |
+
+  16 | 070355555588
+  18 | 000022233333335577777777888822335777888
+  20 | 00002223378800035778
+  22 | 0002335578023578
+  24 | 00228
+  26 | 23
+  28 | 080
+  30 | 7
+  32 | 2337
+  34 | 250077
+  36 | 0000823577
+  38 | 2333335582225577
+  40 | 0000003357788888002233555577778
+  42 | 03335555778800233333555577778
+  44 | 02222335557780000000023333357778888
+  46 | 0000233357700000023578
+  48 | 00000022335800333
+  50 | 0370
+
+ +

A stem-and-leaf plot is like a histogram, and R has a function +hist to plot histograms. + +

+
+
> hist(eruptions)
+## make the bins smaller, make a plot of density
+> hist(eruptions, seq(1.6, 5.2, 0.2), prob=TRUE)
+> lines(density(eruptions, bw=0.1))
+> rug(eruptions) # show the actual data points
+
+ + + +

More elegant density plots can be made by density, and we added a +line produced by density in this example. The bandwidth +bw was chosen by trial-and-error as the default gives too much +smoothing (it usually does for “interesting” densities). (Better +automated methods of bandwidth choice are available, and in this example +bw = "SJ" gives a good result.) +

[Figure images/hist: histogram of the eruption times with an estimated density curve]
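A sketch of the same plot using the automated bandwidth choice mentioned above:

> hist(eruptions, seq(1.6, 5.2, 0.2), prob=TRUE)
> lines(density(eruptions, bw="SJ"), lty=2)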

We can plot the empirical cumulative distribution function by using the +function ecdf. + + +

+
+
> plot(ecdf(eruptions), do.points=FALSE, verticals=TRUE)
+
+ +

This distribution is obviously far from any standard distribution. +How about the right-hand mode, say eruptions of longer than 3 minutes? +Let us fit a normal distribution and overlay the fitted CDF. +

+
+
> long <- eruptions[eruptions > 3]
+> plot(ecdf(long), do.points=FALSE, verticals=TRUE)
+> x <- seq(3, 5.4, 0.01)
+> lines(x, pnorm(x, mean=mean(long), sd=sqrt(var(long))), lty=3)
+
[Figure images/ecdf: empirical CDF of the long eruptions with a fitted normal CDF]

Quantile-quantile (Q-Q) plots can help us examine this more carefully. + + + +

+
+
par(pty="s")       # arrange for a square figure region
+qqnorm(long); qqline(long)
+
+ +

which shows a reasonable fit but a shorter right tail than one would +expect from a normal distribution. Let us compare this with some +simulated data from a t distribution +

[Figure images/QQ: normal Q-Q plot of the long eruption times]
+
x <- rt(250, df = 5)
+qqnorm(x); qqline(x)
+
+ +

which will usually (if it is a random sample) show longer tails than +expected for a normal. We can make a Q-Q plot against the generating +distribution by +

+
+
qqplot(qt(ppoints(250), df = 5), x, xlab = "Q-Q plot for t dsn")
+qqline(x)
+
+ +

Finally, we might want a more formal test of agreement with normality +(or not). R provides the Shapiro-Wilk test + + +

+
+
> shapiro.test(long)
+
+         Shapiro-Wilk normality test
+
+data:  long
+W = 0.9793, p-value = 0.01052
+
+ +

and the Kolmogorov-Smirnov test + + +

+
+
> ks.test(long, "pnorm", mean = mean(long), sd = sqrt(var(long)))
+
+         One-sample Kolmogorov-Smirnov test
+
+data:  long
+D = 0.0661, p-value = 0.4284
+alternative hypothesis: two.sided
+
+ +

(Note that the distribution theory is not valid here as we +have estimated the parameters of the normal distribution from the same +sample.) +

+
+ + + +

8.3 One- and two-sample tests

+ + +

So far we have compared a single sample to a normal distribution. A +much more common operation is to compare aspects of two samples. Note +that in R, all “classical” tests including the ones used below are +in package stats which is normally loaded. +

+

Consider the following sets of data on the latent heat of the fusion of +ice (cal/gm) from Rice (1995, p.490) +

+
+
Method A: 79.98 80.04 80.02 80.04 80.03 80.03 80.04 79.97
+          80.05 80.03 80.02 80.00 80.02
+Method B: 80.02 79.94 79.98 79.97 79.97 80.03 79.95 79.97
+
+ +

Boxplots provide a simple graphical comparison of the two samples. +

+
+
A <- scan()
+79.98 80.04 80.02 80.04 80.03 80.03 80.04 79.97
+80.05 80.03 80.02 80.00 80.02
+
+B <- scan()
+80.02 79.94 79.98 79.97 79.97 80.03 79.95 79.97
+
+boxplot(A, B)
+
+ + + +

which indicates that the first group tends to give higher results than +the second. +

[Figure images/ice: boxplots of the latent heat data for Methods A and B]

To test for the equality of the means of the two samples, we can use an unpaired t-test by

+
+
> t.test(A, B)
+
+         Welch Two Sample t-test
+
+data:  A and B
+t = 3.2499, df = 12.027, p-value = 0.00694
+alternative hypothesis: true difference in means is not equal to 0
+95 percent confidence interval:
+ 0.01385526 0.07018320
+sample estimates:
+mean of x mean of y
+ 80.02077  79.97875
+
+ +

which does indicate a significant difference, assuming normality. By +default the R function does not assume equality of variances in the +two samples (in contrast to the similar S-PLUS t.test +function). We can use the F test to test for equality in the variances, +provided that the two samples are from normal populations. +

+
+
> var.test(A, B)
+
+         F test to compare two variances
+
+data:  A and B
+F = 0.5837, num df = 12, denom df =  7, p-value = 0.3938
+alternative hypothesis: true ratio of variances is not equal to 1
+95 percent confidence interval:
+ 0.1251097 2.1052687
+sample estimates:
+ratio of variances
+         0.5837405
+
+ + +

which shows no evidence of a significant difference, and so we can use +the classical t-test that assumes equality of the variances. +

+
+
> t.test(A, B, var.equal=TRUE)
+
+         Two Sample t-test
+
+data:  A and B
+t = 3.4722, df = 19, p-value = 0.002551
+alternative hypothesis: true difference in means is not equal to 0
+95 percent confidence interval:
+ 0.01669058 0.06734788
+sample estimates:
+mean of x mean of y
+ 80.02077  79.97875
+
+ +

All these tests assume normality of the two samples. The two-sample +Wilcoxon (or Mann-Whitney) test only assumes a common continuous +distribution under the null hypothesis. +

+ + +
+
> wilcox.test(A, B)
+
+         Wilcoxon rank sum test with continuity correction
+
+data:  A and B
+W = 89, p-value = 0.007497
+alternative hypothesis: true location shift is not equal to 0
+
+Warning message:
+Cannot compute exact p-value with ties in: wilcox.test(A, B)
+
+ +

Note the warning: there are several ties in each sample, which suggests +strongly that these data are from a discrete distribution (probably due +to rounding). +

+

There are several ways to compare graphically the two samples. We have +already seen a pair of boxplots. The following +

+
+
> plot(ecdf(A), do.points=FALSE, verticals=TRUE, xlim=range(A, B))
+> plot(ecdf(B), do.points=FALSE, verticals=TRUE, add=TRUE)
+
+ +

will show the two empirical CDFs, and qqplot will perform a Q-Q +plot of the two samples. The Kolmogorov-Smirnov test is of the maximal +vertical distance between the two ecdf’s, assuming a common continuous +distribution: +

+
+
> ks.test(A, B)
+
+         Two-sample Kolmogorov-Smirnov test
+
+data:  A and B
+D = 0.5962, p-value = 0.05919
+alternative hypothesis: two-sided
+
+Warning message:
+cannot compute correct p-values with ties in: ks.test(A, B)
+
+ +
+ + + +

9 Grouping, loops and conditional execution

+ + + + + + + +
+ + + +

9.1 Grouped expressions

+ + +

R is an expression language in the sense that its only command type +is a function or expression which returns a result. Even an assignment +is an expression whose result is the value assigned, and it may be used +wherever any expression may be used; in particular multiple assignments +are possible. +

+

Commands may be grouped together in braces, {expr_1; …; expr_m}, in which case the value of the group is the result of the last expression in the group evaluated. Since such a group is also an expression it may, for example, be itself included in parentheses and used as part of an even larger expression, and so on.

+
+ + + +

9.2 Control statements

+ + + + + + + +
+ + + +

9.2.1 Conditional execution: if statements

+ + +

The language has available a conditional construction of the form +

+
+
> if (expr_1) expr_2 else expr_3
+
+ + + +

where expr_1 must evaluate to a single logical value and the +result of the entire expression is then evident. +

+ + +

The “short-circuit” operators && and || are often used +as part of the condition in an if statement. Whereas & +and | apply element-wise to vectors, && and || +apply to vectors of length one, and only evaluate their second argument +if necessary. +

+ +

There is a vectorized version of the if/else construct, +the ifelse function. This has the form ifelse(condition, a, +b) and returns a vector of the length of its longest argument, with +elements a[i] if condition[i] is true, otherwise +b[i]. +
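For instance, a small sketch:

> x <- c(-2, 0.5, 3)
> ifelse(x > 0, "positive", "not positive")
[1] "not positive" "positive"     "positive"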

+ +
+ + + +

9.2.2 Repetitive execution: for loops, repeat and while

+ + +

There is also a for loop construction which has the form +

+
+
> for (name in expr_1) expr_2
+
+ +

where name is the loop variable. expr_1 is a +vector expression, (often a sequence like 1:20), and +expr_2 is often a grouped expression with its sub-expressions +written in terms of the dummy name. expr_2 is repeatedly +evaluated as name ranges through the values in the vector result +of expr_1. +

+

As an example, suppose ind is a vector of class indicators and we +wish to produce separate plots of y versus x within +classes. One possibility here is to use coplot(),21 +which will produce an array of plots corresponding to each level of the +factor. Another way to do this, now putting all plots on the one +display, is as follows: +

+
+
> xc <- split(x, ind)
+> yc <- split(y, ind)
+> for (i in 1:length(yc)) {
+    plot(xc[[i]], yc[[i]])
+    abline(lsfit(xc[[i]], yc[[i]]))
+  }
+
+ + + +

(Note the function split() which produces a list of vectors +obtained by splitting a larger vector according to the classes specified +by a factor. This is a useful function, mostly used in connection +with boxplots. See the help facility for further details.) +

+
+

Warning: for() loops are used in R code much less +often than in compiled languages. Code that takes a ‘whole object’ view +is likely to be both clearer and faster in R. +

+ +

Other looping facilities include the +

+
+
> repeat expr
+
+ + +

statement and the +

+
+
> while (condition) expr
+
+ + +

statement. +

+

The break statement can be used to terminate any loop, possibly +abnormally. This is the only way to terminate repeat loops. + +

+

The next statement can be used to discontinue one particular +cycle and skip to the “next”. + +
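A small sketch combining while, next and break:

> i <- 0
> while (i < 5) {
    i <- i + 1
    if (i == 3) next        # skip the rest of this cycle
    cat("i =", i, "\n")
    if (i >= 4) break       # leave the loop early
  }
i = 1
i = 2
i = 4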

+

Control statements are most often used in connection with +functions which are discussed in Writing your own functions, and where more examples will emerge. +

+ +
+ + + +

10 Writing your own functions

+ + +

As we have seen informally along the way, the R language allows the +user to create objects of mode function. These are true R +functions that are stored in a special internal form and may be used in +further expressions and so on. In the process, the language gains +enormously in power, convenience and elegance, and learning to write +useful functions is one of the main ways to make your use of R +comfortable and productive. +

+

It should be emphasized that most of the functions supplied as part of +the R system, such as mean(), var(), +postscript() and so on, are themselves written in R and thus +do not differ materially from user written functions. +

+

A function is defined by an assignment of the form +

+
+
> name <- function(arg_1, arg_2, …) expression
+
+ + +

The expression is an R expression, (usually a grouped +expression), that uses the arguments, arg_i, to calculate a value. +The value of the expression is the value returned for the function. +

+

A call to the function then usually takes the form +name(expr_1, expr_2, …) and may occur +anywhere a function call is legitimate. +

+ + + + + + + + + + + + +
+ + + +

10.1 Simple examples

+ +

As a first example, consider a function to calculate the two sample +t-statistic, showing “all the steps”. This is an artificial +example, of course, since there are other, simpler ways of achieving the +same end. +

+

The function is defined as follows: +

+
+
> twosam <- function(y1, y2) {
+    n1  <- length(y1); n2  <- length(y2)
+    yb1 <- mean(y1);   yb2 <- mean(y2)
+    s1  <- var(y1);    s2  <- var(y2)
+    s <- ((n1-1)*s1 + (n2-1)*s2)/(n1+n2-2)
+    tst <- (yb1 - yb2)/sqrt(s*(1/n1 + 1/n2))
+    tst
+  }
+
+ +

With this function defined, you could perform two sample t-tests +using a call such as +

+
+
> tstat <- twosam(data$male, data$female); tstat
+
+ +

As a second example, consider a function to emulate directly the +MATLAB backslash command, which returns the coefficients of the +orthogonal projection of the vector y onto the column space of +the matrix, X. (This is ordinarily called the least squares +estimate of the regression coefficients.) This would ordinarily be +done with the qr() function; however this is sometimes a bit +tricky to use directly and it pays to have a simple function such as the +following to use it safely. +

+

Thus given an n by 1 vector y and an n by p matrix X then X \ y is defined as (X’X)^{-}X’y, where (X’X)^{-} is a generalized inverse of X'X.

+
+
> bslash <- function(X, y) {
+  X <- qr(X)
+  qr.coef(X, y)
+}
+
+ +

After this object is created it may be used in statements such as +

+
+
> regcoeff <- bslash(Xmat, yvar)
+
+ +

and so on. +

+

The classical R function lsfit() does this job quite well, and +more22. It in turn uses the functions qr() and qr.coef() +in the slightly counterintuitive way above to do this part of the +calculation. Hence there is probably some value in having just this +part isolated in a simple to use function if it is going to be in +frequent use. If so, we may wish to make it a matrix binary operator +for even more convenient use. +

+
+ + + +

10.2 Defining new binary operators

+ + +

Had we given the bslash() function a different name, namely one of +the form +

+
+
%anything%
+
+ +

it could have been used as a binary operator in expressions +rather than in function form. Suppose, for example, we choose ! +for the internal character. The function definition would then start as +

+
+
> "%!%" <- function(X, y) { … }
+
+ +

(Note the use of quote marks.) The function could then be used as +X %!% y. (The backslash symbol itself is not a convenient choice +as it presents special problems in this context.) +

+

The matrix multiplication operator, %*%, and the outer product +matrix operator %o% are other examples of binary operators +defined in this way. +

+
+ + + +

10.3 Named arguments and defaults

+ + + +

As first noted in Generating regular sequences, if arguments to +called functions are given in the “name=object” +form, they may be given in any order. Furthermore the argument sequence +may begin in the unnamed, positional form, and specify named arguments +after the positional arguments. +

+

Thus if there is a function fun1 defined by +

+
+
> fun1 <- function(data, data.frame, graph, limit) {
+    [function body omitted]
+  }
+
+ +

then the function may be invoked in several ways, for example +

+
+
> ans <- fun1(d, df, TRUE, 20)
+> ans <- fun1(d, df, graph=TRUE, limit=20)
+> ans <- fun1(data=d, limit=20, graph=TRUE, data.frame=df)
+
+ +

are all equivalent. +

+

In many cases arguments can be given commonly appropriate default +values, in which case they may be omitted altogether from the call when +the defaults are appropriate. For example, if fun1 were defined +as +

+
+
> fun1 <- function(data, data.frame, graph=TRUE, limit=20) { … }
+
+ +

it could be called as +

+
+
> ans <- fun1(d, df)
+
+ +

which is now equivalent to the three cases above, or as +

+
+
> ans <- fun1(d, df, limit=10)
+
+ +

which changes one of the defaults. +

+

It is important to note that defaults may be arbitrary expressions, even +involving other arguments to the same function; they are not restricted +to be constants as in our simple example here. +
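For instance, a sketch (fun2 is an invented name) in which both defaults are computed from the first argument:

> fun2 <- function(x, n = length(x), scale = max(abs(x))) x[1:n]/scale
> fun2(c(2, -8, 4))
[1]  0.25 -1.00  0.50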

+
+ + + +

10.4 The ‘...’ argument

+ + +

Another frequent requirement is to allow one function to pass on argument settings to another. For example many graphics functions use the function par() and functions like plot() allow the user to pass on graphical parameters to par() to control the graphical output. (See The par() function, for more details on the par() function.) This can be done by including an extra argument, literally ‘...’, of the function, which may then be passed on. An outline example is given below.

+
+
fun1 <- function(data, data.frame, graph=TRUE, limit=20, ...) {
+  [omitted statements]
+  if (graph)
+    par(pch="*", ...)
+  [more omissions]
+}
+
+ +

Less frequently, a function will need to refer to components of ‘...’. The expression list(...) evaluates all such arguments and returns them in a named list, while ..1, ..2, etc. evaluate them one at a time, with ‘..n’ returning the n’th unmatched argument.
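A sketch of a function (show.args is an invented name) that inspects its ‘...’ arguments:

> show.args <- function(...) {
    extras <- list(...)                        # all unmatched arguments, as a list
    cat("number of extra arguments:", length(extras), "\n")
    cat("the first one is:", ..1, "\n")
  }
> show.args(10, 20, 30)
number of extra arguments: 3
the first one is: 10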

+
+ + + +

10.5 Assignments within functions

+ +

Note that any ordinary assignments done within the function are +local and temporary and are lost after exit from the function. Thus +the assignment X <- qr(X) does not affect the value of the +argument in the calling program. +

+

To understand completely the rules governing the scope of R assignments +the reader needs to be familiar with the notion of an evaluation +frame. This is a somewhat advanced, though hardly difficult, +topic and is not covered further here. +

+

If global and permanent assignments are intended within a function, then +either the “superassignment” operator, <<- or the function +assign() can be used. See the help document for details. +S-PLUS users should be aware that <<- has different semantics +in R. These are discussed further in Scope. +

+
+ + + +

10.6 More advanced examples

+ + + + + + + +
+ + + +

10.6.1 Efficiency factors in block designs

+ +

As a more complete, if a little pedestrian, example of a function, +consider finding the efficiency factors for a block design. (Some +aspects of this problem have already been discussed in Index matrices.) +

+

A block design is defined by two factors, say blocks (b +levels) and varieties (v levels). If R and +K are the v by v and b by b +replications and block size matrices, respectively, and +N is the b by v incidence matrix, then the +efficiency factors are defined as the eigenvalues of the matrix +E = I_v - R^{-1/2}N’K^{-1}NR^{-1/2} = I_v - A’A, where +A = K^{-1/2}NR^{-1/2}. +One way to write the function is given below. +

+
+
> bdeff <- function(blocks, varieties) {
+    blocks <- as.factor(blocks)             # minor safety move
+    b <- length(levels(blocks))
+    varieties <- as.factor(varieties)       # minor safety move
+    v <- length(levels(varieties))
+    K <- as.vector(table(blocks))           # remove dim attr
+    R <- as.vector(table(varieties))        # remove dim attr
+    N <- table(blocks, varieties)
+    A <- 1/sqrt(K) * N * rep(1/sqrt(R), rep(b, v))
+    sv <- svd(A)
+    list(eff=1 - sv$d^2, blockcv=sv$u, varietycv=sv$v)
+}
+
+ +

It is numerically slightly better to work with the singular value +decomposition on this occasion rather than the eigenvalue routines. +

+

The result of the function is a list giving not only the efficiency +factors as the first component, but also the block and variety canonical +contrasts, since sometimes these give additional useful qualitative +information. +

+
+ + + +

10.6.2 Dropping all names in a printed array

+ +

For printing purposes with large matrices or arrays, it is often useful +to print them in close block form without the array names or numbers. +Removing the dimnames attribute will not achieve this effect, but +rather the array must be given a dimnames attribute consisting of +empty strings. For example to print a matrix, X +

+
+
> temp <- X
+> dimnames(temp) <- list(rep("", nrow(X)), rep("", ncol(X)))
+> temp; rm(temp)
+
+ +

This can be much more conveniently done using a function, +no.dimnames(), shown below, as a “wrap around” to achieve the +same result. It also illustrates how some effective and useful user +functions can be quite short. +

+
+
no.dimnames <- function(a) {
+  ## Remove all dimension names from an array for compact printing.
+  d <- list()
+  l <- 0
+  for(i in dim(a)) {
+    d[[l <- l + 1]] <- rep("", i)
+  }
+  dimnames(a) <- d
+  a
+}
+
+ +

With this function defined, an array may be printed in close format +using +

+
+
> no.dimnames(X)
+
+ +

This is particularly useful for large integer arrays, where patterns are +the real interest rather than the values. +

+
+ + + +

10.6.3 Recursive numerical integration

+ +

Functions may be recursive, and may themselves define functions within +themselves. Note, however, that such functions, or indeed variables, +are not inherited by called functions in higher evaluation frames as +they would be if they were on the search path. +

+

The example below shows a naive way of performing one-dimensional +numerical integration. The integrand is evaluated at the end points of +the range and in the middle. If the one-panel trapezium rule answer is +close enough to the two panel, then the latter is returned as the value. +Otherwise the same process is recursively applied to each panel. The +result is an adaptive integration process that concentrates function +evaluations in regions where the integrand is farthest from linear. +There is, however, a heavy overhead, and the function is only +competitive with other algorithms when the integrand is both smooth and +very difficult to evaluate. +

+

The example is also given partly as a little puzzle in R programming. +

+
+
area <- function(f, a, b, eps = 1.0e-06, lim = 10) {
+  fun1 <- function(f, a, b, fa, fb, a0, eps, lim, fun) {
+    ## function ‘fun1’ is only visible inside ‘area’
+    d <- (a + b)/2
+    h <- (b - a)/4
+    fd <- f(d)
+    a1 <- h * (fa + fd)
+    a2 <- h * (fd + fb)
+    if(abs(a0 - a1 - a2) < eps || lim == 0)
+      return(a1 + a2)
+    else {
+      return(fun(f, a, d, fa, fd, a1, eps, lim - 1, fun) +
+             fun(f, d, b, fd, fb, a2, eps, lim - 1, fun))
+    }
+  }
+  fa <- f(a)
+  fb <- f(b)
+  a0 <- ((fa + fb) * (b - a))/2
+  fun1(f, a, b, fa, fb, a0, eps, lim, fun1)
+}
+
+ + + + + + +
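As a hypothetical use of the definition above, a smooth integrand such as sin over the interval [0, pi] (whose exact integral is 2) could be handled directly:

> area(sin, 0, pi)    # approximately 2; accuracy is governed by 'eps' and 'lim'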
+ + + +

10.7 Scope

+ + +

The discussion in this section is somewhat more technical than in other +parts of this document. However, it details one of the major differences +between S-PLUS and R. +

+

The symbols which occur in the body of a function can be divided into +three classes; formal parameters, local variables and free variables. +The formal parameters of a function are those occurring in the argument +list of the function. Their values are determined by the process of +binding the actual function arguments to the formal parameters. +Local variables are those whose values are determined by the evaluation +of expressions in the body of the functions. Variables which are not +formal parameters or local variables are called free variables. Free +variables become local variables if they are assigned to. Consider the +following function definition. +

+
+
f <- function(x) {
+  y <- 2*x
+  print(x)
+  print(y)
+  print(z)
+}
+
+ +

In this function, x is a formal parameter, y is a local +variable and z is a free variable. +

+

In R the free variable bindings are resolved by first looking in the +environment in which the function was created. This is called +lexical scope. First we define a function called cube. +

+
+
cube <- function(n) {
+  sq <- function() n*n
+  n*sq()
+}
+
+ +

The variable n in the function sq is not an argument to that +function. Therefore it is a free variable and the scoping rules must be +used to ascertain the value that is to be associated with it. Under static +scope (S-PLUS) the value is that associated with a global variable named +n. Under lexical scope (R) it is the parameter to the function +cube since that is the active binding for the variable n at +the time the function sq was defined. The difference between +evaluation in R and evaluation in S-PLUS is that S-PLUS looks for a +global variable called n while R first looks for a variable +called n in the environment created when cube was invoked. +

+
+
## first evaluation in S
+S> cube(2)
+Error in sq(): Object "n" not found
+Dumped
+S> n <- 3
+S> cube(2)
+[1] 18
+## then the same function evaluated in R
+R> cube(2)
+[1] 8
+
+ +

Lexical scope can also be used to give functions mutable state. +In the following example we show how R can be used to mimic a bank +account. A functioning bank account needs to have a balance or total, a +function for making withdrawals, a function for making deposits and a +function for stating the current balance. We achieve this by creating +the three functions within account and then returning a list +containing them. When account is invoked it takes a numerical +argument total and returns a list containing the three functions. +Because these functions are defined in an environment which contains +total, they will have access to its value. +

+

The special assignment operator, <<-, is used to change the value associated with total. This operator looks back in enclosing environments for an environment that contains the symbol total and, when it finds such an environment, replaces the value, in that environment, with the value of the right hand side. If the global or top-level environment is reached without finding the symbol total then that variable is created and assigned to there. For most users <<- creates a global variable and assigns the value of the right hand side to it. Only when <<- has been used in a function that was returned as the value of another function will the special behavior described here occur.
The special assignment operator, <<-, is used to change the value associated with total. This operator looks back in enclosing environments for an environment that contains the symbol total and, when it finds such an environment, replaces the value, in that environment, with the value of the right hand side. If the global or top-level environment is reached without finding the symbol total then that variable is created and assigned to there. For most users <<- creates a global variable and assigns the value of the right hand side to it. Only when <<- has been used in a function that was returned as the value of another function will the special behavior described here occur.

+
+
open.account <- function(total) {
+  list(
+    deposit = function(amount) {
+      if(amount <= 0)
+        stop("Deposits must be positive!\n")
+      total <<- total + amount
+      cat(amount, "deposited.  Your balance is", total, "\n\n")
+    },
+    withdraw = function(amount) {
+      if(amount > total)
+        stop("You don't have that much money!\n")
+      total <<- total - amount
+      cat(amount, "withdrawn.  Your balance is", total, "\n\n")
+    },
+    balance = function() {
+      cat("Your balance is", total, "\n\n")
+    }
+  )
+}
+
+ross <- open.account(100)
+robert <- open.account(200)
+
+ross$withdraw(30)
+ross$balance()
+robert$balance()
+
+ross$deposit(50)
+ross$balance()
+ross$withdraw(500)
+
+ +
+ + + +

10.8 Customizing the environment

+ + +

Users can customize their environment in several different ways. There +is a site initialization file and every directory can have its own +special initialization file. Finally, the special functions +.First and .Last can be used. +

+

The location of the site initialization file is taken from the value of the R_PROFILE environment variable. If that variable is unset, the file Rprofile.site in the R home subdirectory etc is used. This file should contain the commands that you want to execute every time R is started under your system. A second, personal, profile file named .Rprofile can be placed in any directory. If R is invoked in that directory then that file will be sourced. This file gives individual users control over their workspace and allows for different startup procedures in different working directories. If no .Rprofile file is found in the startup directory, then R looks for a .Rprofile file in the user's home directory and uses that (if it exists). If the environment variable R_PROFILE_USER is set, the file it points to is used instead of the .Rprofile files.

+

Any function named .First() in either of the two profile files or +in the .RData image has a special status. It is automatically +performed at the beginning of an R session and may be used to +initialize the environment. For example, the definition in the example +below alters the prompt to $ and sets up various other useful +things that can then be taken for granted in the rest of the session. +

+

Thus, the sequence in which files are executed is, Rprofile.site, +the user profile, .RData and then .First(). A definition +in later files will mask definitions in earlier files. +

+
+
> .First <- function() {
+  options(prompt="$ ", continue="+\t")  # $ is the prompt
+  options(digits=5, length=999)         # custom numbers and printout
+  x11()                                 # for graphics
+  par(pch = "+")                        # plotting character
+  source(file.path(Sys.getenv("HOME"), "R", "mystuff.R"))
+                                        # my personal functions
+  library(MASS)                         # attach a package
+}
+
+ + +

Similarly a function .Last(), if defined, is (normally) executed +at the very end of the session. An example is given below. +

+
+
> .Last <- function() {
+  graphics.off()                        # a small safety measure.
+  cat(paste(date(),"\nAdios\n"))        # Is it time for lunch?
+}
+
+ + +
+ + + +

10.9 Classes, generic functions and object orientation

+ + + + +

The class of an object determines how it will be treated by what are +known as generic functions. Put the other way round, a generic +function performs a task or action on its arguments specific to +the class of the argument itself. If the argument lacks any class +attribute, or has a class not catered for specifically by the generic +function in question, there is always a default action provided. +

+

An example makes things clearer. The class mechanism offers the user +the facility of designing and writing generic functions for special +purposes. Among the other generic functions are plot() for +displaying objects graphically, summary() for summarizing +analyses of various types, and anova() for comparing statistical +models. +

+

The number of generic functions that can treat a class in a specific way +can be quite large. For example, the functions that can accommodate in +some fashion objects of class "data.frame" include +

+
+
[     [[<-    any    as.matrix
+[<-   mean    plot   summary
+
+ + +

A currently complete list can be got by using the methods() +function: +

+
+
> methods(class="data.frame")
+
+ +

Conversely the number of classes a generic function can handle can also +be quite large. For example the plot() function has a default +method and variants for objects of classes "data.frame", +"density", "factor", and more. A complete list can be got +again by using the methods() function: +

+
+
> methods(plot)
+
+ +

For many generic functions the function body is quite short, for example +

+
+
> coef
+function (object, ...)
+UseMethod("coef")
+
+ +

The presence of UseMethod indicates this is a generic function. +To see what methods are available we can use methods() +

+
+
> methods(coef)
+[1] coef.aov*         coef.Arima*       coef.default*     coef.listof*
+[5] coef.nls*         coef.summary.nls*
+
+   Non-visible functions are asterisked
+
+ +

In this example there are six methods, none of which can be seen by +typing its name. We can read these by either of +

+ + +
+
> getAnywhere("coef.aov")
+A single object matching ‘coef.aov’ was found
+It was found in the following places
+  registered S3 method for coef from namespace stats
+  namespace:stats
+with value
+
+function (object, ...)
+{
+    z <- object$coef
+    z[!is.na(z)]
+}
+
+> getS3method("coef", "aov")
+function (object, ...)
+{
+    z <- object$coef
+    z[!is.na(z)]
+}
+
+ +

A function named gen.cl will be invoked by the +generic gen for class cl, so do not name +functions in this style unless they are intended to be methods. +

+
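As a hypothetical sketch of this naming convention in use (the class name "acct" and its fields are invented), a user can register a method simply by defining a suitably named function; the generic print() then dispatches to it automatically:

> x <- structure(list(owner = "ross", balance = 100), class = "acct")
> print.acct <- function(x, ...) {
    cat("Account of", x$owner, "-- balance:", x$balance, "\n")
    invisible(x)
  }
> x     # auto-printing calls print(), which dispatches to print.acct()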

The reader is referred to the R Language Definition for a more +complete discussion of this mechanism. +

+ +
+ +
+


+
+ +

11 Statistical models in R

+ + +

This section presumes the reader has some familiarity with statistical +methodology, in particular with regression analysis and the analysis of +variance. Later we make some rather more ambitious presumptions, namely +that something is known about generalized linear models and nonlinear +regression. +

+

The requirements for fitting statistical models are sufficiently well +defined to make it possible to construct general tools that apply in a +broad spectrum of problems. +

+

R provides an interlocking suite of facilities that make fitting +statistical models very simple. As we mention in the introduction, the +basic output is minimal, and one needs to ask for the details by calling +extractor functions. +

+ + + + + + + + + + + +
+ + + +

11.1 Defining statistical models; formulae

+ + +

The template for a statistical model is a linear regression model with +independent, homoscedastic errors +

+
+
y_i = sum_{j=0}^p beta_j x_{ij} + e_i,     i = 1, …, n,
+
+

where the e_i are NID(0, sigma^2). +In matrix terms this would be written +

+
+
y = X  beta + e
+
+ +

where y is the response vector, X is the model matrix or design matrix and has columns x_0, x_1, …, x_p, the determining variables. Very often x_0 will be a column of ones defining an intercept term.

+ +

Examples

+ +

Before giving a formal specification, a few examples may usefully set +the picture. +

+

Suppose y, x, x0, x1, x2, … are +numeric variables, X is a matrix and A, B, +C, … are factors. The following formulae on the left +side below specify statistical models as described on the right. +

+
+
y ~ x
+
y ~ 1 + x
+

Both imply the same simple linear regression model of y on +x. The first has an implicit intercept term, and the second an +explicit one. +

+
+
y ~ 0 + x
+
y ~ -1 + x
+
y ~ x - 1
+

Simple linear regression of y on x through the origin +(that is, without an intercept term). +

+
+
log(y) ~ x1 + x2
+

Multiple regression of the transformed variable, +log(y), +on x1 and x2 (with an implicit intercept term). +

+
+
y ~ poly(x,2)
+
y ~ 1 + x + I(x^2)
+

Polynomial regression of y on x of degree 2. The first +form uses orthogonal polynomials, and the second uses explicit powers, +as basis. +

+
+
y ~ X + poly(x,2)
+

Multiple regression y with model matrix consisting of the matrix +X as well as polynomial terms in x to degree 2. +

+
+
y ~ A
+

Single classification analysis of variance model of y, with +classes determined by A. +

+
+
y ~ A + x
+

Single classification analysis of covariance model of y, with +classes determined by A, and with covariate x. +

+
+
y ~ A*B
+
y ~ A + B + A:B
+
y ~ B %in% A
+
y ~ A/B
+

Two factor non-additive model of y on A and B. The +first two specify the same crossed classification and the second two +specify the same nested classification. In abstract terms all four +specify the same model subspace. +

+
+
y ~ (A + B + C)^2
+
y ~ A*B*C - A:B:C
+

Three factor experiment but with a model containing main effects and two +factor interactions only. Both formulae specify the same model. +

+
+
y ~ A * x
+
y ~ A/x
+
y ~ A/(1 + x) - 1
+

Separate simple linear regression models of y on x within +the levels of A, with different codings. The last form produces +explicit estimates of as many different intercepts and slopes as there +are levels in A. +

+
+
y ~ A*B + Error(C)
+

An experiment with two treatment factors, A and B, and +error strata determined by factor C. For example a split plot +experiment, with whole plots (and hence also subplots), determined by +factor C. +

+
+ + +

The operator ~ is used to define a model formula in R. +The form, for an ordinary linear model, is +

+
+
response ~ op_1 term_1 op_2 term_2 op_3 term_3 
+
+ +

where +

+
+
response
+

is a vector or matrix, (or expression evaluating to a vector or matrix) +defining the response variable(s). +

+
op_i
+

is an operator, either + or -, implying the inclusion or +exclusion of a term in the model, (the first is optional). +

+
term_i
+

is either +

    +
  • a vector or matrix expression, or 1, +
  • a factor, or +
  • a formula expression consisting of factors, vectors or matrices +connected by formula operators. +
+

In all cases each term defines a collection of columns either to be +added to or removed from the model matrix. A 1 stands for an +intercept column and is by default included in the model matrix unless +explicitly removed. +

+
+
+ +

The formula operators are similar in effect to the Wilkinson and +Rogers notation used by such programs as Glim and Genstat. One +inevitable change is that the operator ‘.’ becomes +‘:’ since the period is a valid name character in R. +

+

The notation is summarized below (based on Chambers & Hastie, 1992, +p.29): +

+
+
Y ~ M
+

Y is modeled as M. +

+
+
M_1 + M_2
+

Include M_1 and M_2. +

+
+
M_1 - M_2
+

Include M_1 leaving out terms of M_2. +

+
+
M_1 : M_2
+

The tensor product of M_1 and M_2. If both terms are +factors, then the “subclasses” factor. +

+
+
M_1 %in% M_2
+

Similar to M_1:M_2, but with a different coding. +

+
+
M_1 * M_2
+

M_1 + M_2 + M_1:M_2. +

+
+
M_1 / M_2
+

M_1 + M_2 %in% M_1. +

+
+
M^n
+

All terms in M together with “interactions” up to order n +

+
+
I(M)
+

Insulate M. Inside M all operators have their normal +arithmetic meaning, and that term appears in the model matrix. +

+
+ +

Note that inside the parentheses that usually enclose function arguments +all operators have their normal arithmetic meaning. The function +I() is an identity function used to allow terms in model formulae +to be defined using arithmetic operators. +

+

Note particularly that the model formulae specify the columns +of the model matrix, the specification of the parameters being +implicit. This is not the case in other contexts, for example in +specifying nonlinear models. +

+ + + + +
+ + + +
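One way to see exactly which columns a given formula generates is to inspect the model matrix itself with model.matrix(); in this hypothetical sketch the small data frame d is invented purely for illustration:

> d <- data.frame(x = 1:4, A = factor(c("a", "a", "b", "b")))
> model.matrix(~ x, data = d)       # an intercept column plus x
> model.matrix(~ A, data = d)       # intercept plus the indicator for level "b"
> model.matrix(~ 0 + A, data = d)   # no intercept: one indicator per level of A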

11.1.1 Contrasts

+ + +

We need at least some idea how the model formulae specify the columns of +the model matrix. This is easy if we have continuous variables, as each +provides one column of the model matrix (and the intercept will provide +a column of ones if included in the model). +

+ + +

What about a k-level factor A? The answer differs for +unordered and ordered factors. For unordered factors k - +1 columns are generated for the indicators of the second, …, +kth levels of the factor. (Thus the implicit parameterization is +to contrast the response at each level with that at the first.) For +ordered factors the k - 1 columns are the orthogonal +polynomials on 1, …, k, omitting the constant term. +

+

Although the answer is already complicated, it is not the whole story. +First, if the intercept is omitted in a model that contains a factor +term, the first such term is encoded into k columns giving the +indicators for all the levels. Second, the whole behavior can be +changed by the options setting for contrasts. The default +setting in R is +

+
+
options(contrasts = c("contr.treatment", "contr.poly"))
+
+ +

The main reason for mentioning this is that R and S have +different defaults for unordered factors, S using Helmert +contrasts. So if you need to compare your results to those of a textbook +or paper which used S-PLUS, you will need to set +

+
+
options(contrasts = c("contr.helmert", "contr.poly"))
+
+ +

This is a deliberate difference, as treatment contrasts (R’s default) +are thought easier for newcomers to interpret. +

+

We have still not finished, as the contrast scheme to be used can be set +for each term in the model using the functions contrasts and +C. + + +

+
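For example, a hypothetical fit in which the factor A is given sum-to-zero contrasts for this model only (the data frame dat and variables y, A and x are invented) might be written as:

> fm <- lm(y ~ C(A, contr.sum) + x, data = dat)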

We have not yet considered interaction terms: these generate the +products of the columns introduced for their component terms. +

+

Although the details are complicated, model formulae in R will +normally generate the models that an expert statistician would expect, +provided that marginality is preserved. Fitting, for example, a model +with an interaction but not the corresponding main effects will in +general lead to surprising results, and is for experts only. +

+ +
+ + + +

11.2 Linear models

+ + +

The basic function for fitting ordinary multiple models is lm(), +and a streamlined version of the call is as follows: + +

+
+
> fitted.model <- lm(formula, data = data.frame)
+
+ +

For example +

+
+
> fm2 <- lm(y ~ x1 + x2, data = production)
+
+ +

would fit a multiple regression model of y on x1 and +x2 (with implicit intercept term). +

+

The important (but technically optional) parameter data = +production specifies that any variables needed to construct the model +should come first from the production data frame. +This is the case regardless of whether data frame +production has been attached on the search path or not. +

+
+ + + +

11.3 Generic functions for extracting model information

+ +

The value of lm() is a fitted model object; technically a list of +results of class "lm". Information about the fitted model can +then be displayed, extracted, plotted and so on by using generic +functions that orient themselves to objects of class "lm". These +include +

+
+
add1    deviance   formula      predict  step
+alias   drop1      kappa        print    summary
+anova   effects    labels       proj     vcov
+coef    family     plot         residuals
+
+ +

A brief description of the most commonly used ones is given below. +

+
+
+
+
anova(object_1, object_2)
+

Compare a submodel with an outer model and produce an analysis of +variance table. +

+ + +
+
coef(object)
+

Extract the regression coefficient (matrix). +

+

Long form: coefficients(object). +

+ +
+
deviance(object)
+

Residual sum of squares, weighted if appropriate. +

+ +
+
formula(object)
+

Extract the model formula. +

+ +
+
plot(object)
+

Produce four plots, showing residuals, fitted values and some +diagnostics. +

+ +
+
predict(object, newdata=data.frame)
+

The data frame supplied must have variables specified with the same +labels as the original. The value is a vector or matrix of predicted +values corresponding to the determining variable values in +data.frame. +

+ + +
+
print(object)
+

Print a concise version of the object. Most often used implicitly. +

+ + +
+
residuals(object)
+

Extract the (matrix of) residuals, weighted as appropriate. +

+

Short form: resid(object). +

+ +
+
step(object)
+

Select a suitable model by adding or dropping terms and preserving +hierarchies. The model with the smallest value of AIC (Akaike’s An +Information Criterion) discovered in the stepwise search is returned. +

+ +
+
summary(object)
+

Print a comprehensive summary of the results of the regression analysis. +

+ +
+
vcov(object)
+

Returns the variance-covariance matrix of the main parameters of a +fitted model object. +

+
+ +
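A hypothetical sequence using several of these extractors, again assuming the production data frame of the earlier examples:

> fm2 <- lm(y ~ x1 + x2, data = production)
> summary(fm2)                     # comprehensive summary of the fit
> coef(fm2)                        # estimated regression coefficients
> deviance(fm2)                    # residual sum of squares
> predict(fm2, newdata = data.frame(x1 = 1, x2 = 2))
> plot(fm2)                        # the four standard diagnostic plots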
+ + + +

11.4 Analysis of variance and model comparison

+ + +

The model fitting function aov(formula, +data=data.frame) + +operates at the simplest level in a very similar way to the function +lm(), and most of the generic functions listed in the table in +Generic functions for extracting model information apply. +

+

It should be noted that in addition aov() allows an analysis of +models with multiple error strata such as split plot experiments, or +balanced incomplete block designs with recovery of inter-block +information. The model formula +

+
+
response ~ mean.formula + Error(strata.formula)
+
+ + +

specifies a multi-stratum experiment with error strata defined by the +strata.formula. In the simplest case, strata.formula is +simply a factor, when it defines a two strata experiment, namely between +and within the levels of the factor. +

+

For example, with all determining variables factors, a model formula such +as that in: +

+
+
> fm <- aov(yield ~ v + n*p*k + Error(farms/blocks), data=farm.data)
+
+ +

would typically be used to describe an experiment with mean model +v + n*p*k and three error strata, namely “between farms”, +“within farms, between blocks” and “within blocks”. +

+ + + + +
+ + + +

11.4.1 ANOVA tables

+ +

Note also that the analysis of variance table (or tables) are for a +sequence of fitted models. The sums of squares shown are the decrease +in the residual sums of squares resulting from an inclusion of +that term in the model at that place in the sequence. +Hence only for orthogonal experiments will the order of inclusion be +inconsequential. +

+

For multistratum experiments the procedure is first to project the +response onto the error strata, again in sequence, and to fit the mean +model to each projection. For further details, see Chambers & Hastie +(1992). +

+

A more flexible alternative to the default full ANOVA table is to +compare two or more models directly using the anova() function. + +

+
+
> anova(fitted.model.1, fitted.model.2, …)
+
+ +

The display is then an ANOVA table showing the differences between the +fitted models when fitted in sequence. The fitted models being compared +would usually be an hierarchical sequence, of course. This does not +give different information to the default, but rather makes it easier to +comprehend and control. +

+
+ + + +

11.5 Updating fitted models

+ + +

The update() function is largely a convenience function that +allows a model to be fitted that differs from one previously fitted +usually by just a few additional or removed terms. Its form is + +

+
+
> new.model <- update(old.model, new.formula)
+
+ +

In the new.formula the special name consisting of a period, +‘.’, + +only, can be used to stand for “the corresponding part of the old model +formula”. For example, +

+
+
> fm05 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = production)
+> fm6  <- update(fm05, . ~ . + x6)
+> smf6 <- update(fm6, sqrt(.) ~ .)
+
+ +

would fit a five variate multiple regression with variables (presumably) +from the data frame production, fit an additional model including +a sixth regressor variable, and fit a variant on the model where the +response had a square root transform applied. +

+

Note especially that if the data= argument is specified on the +original call to the model fitting function, this information is passed on +through the fitted model object to update() and its allies. +

+

The name ‘.’ can also be used in other contexts, but with slightly +different meaning. For example +

+
+
> fmfull <- lm(y ~ . , data = production)
+
+ +

would fit a model with response y and regressor variables +all other variables in the data frame production. +

+

Other functions for exploring incremental sequences of models are +add1(), drop1() and step(). + + + +The names of these give a good clue to their purpose, but for full +details see the on-line help. +

+
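A brief hypothetical illustration, once more assuming the production data frame used above:

> fm <- lm(y ~ x1 + x2 + x3, data = production)
> add1(fm, ~ . + x4 + x5)   # candidate terms to add, considered one at a time
> drop1(fm)                 # effect of dropping each term currently in the model
> step(fm)                  # automatic search guided by AIC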
+ + + +

11.6 Generalized linear models

+ + +

Generalized linear modeling is a development of linear models to +accommodate both non-normal response distributions and transformations +to linearity in a clean and straightforward way. A generalized linear +model may be described in terms of the following sequence of +assumptions: +

+
    +
  • There is a response, y, of interest and stimulus variables +x_1, x_2, …, +whose values influence the distribution of the response. + +
  • The stimulus variables influence the distribution of y through +a single linear function, only. This linear function is called +the linear predictor, and is usually written +
    +
    eta = beta_1 x_1 + beta_2 x_2 + … + beta_p x_p,
    +
    +

    hence x_i has no influence on the distribution of y if and only if +beta_i is zero. +

    +
  • The distribution of y is of the form +
    +
    f_Y(y; mu, phi)
    +  = exp((A/phi) * (y lambda(mu) - gamma(lambda(mu))) + tau(y, phi))
    +
    +

where phi is a scale parameter (possibly known), and is constant for all observations, A represents a prior weight, assumed known but possibly varying with the observations, and mu is the mean of y. So it is assumed that the distribution of y is determined by its mean and possibly a scale parameter as well.

    +
  • The mean, mu, is a smooth invertible function of the linear predictor: +
    +
    mu = m(eta),    eta = m^{-1}(mu) = ell(mu)
    +
    +

    and this inverse function, ell(), is called the link function. +

    +
+ +

These assumptions are loose enough to encompass a wide class of models +useful in statistical practice, but tight enough to allow the +development of a unified methodology of estimation and inference, at +least approximately. The reader is referred to any of the current +reference works on the subject for full details, such as McCullagh & +Nelder (1989) or Dobson (1990). +

+ + + + + +
+ + + +

11.6.1 Families

+ + +

The class of generalized linear models handled by facilities supplied in +R includes gaussian, binomial, poisson, +inverse gaussian and gamma response distributions and also +quasi-likelihood models where the response distribution is not +explicitly specified. In the latter case the variance function +must be specified as a function of the mean, but in other cases this +function is implied by the response distribution. +

+

Each response distribution admits a variety of link functions to connect +the mean with the linear predictor. Those automatically available are +shown in the following table: +

+
+ + + + + + + + +
Family name         Link functions
binomial            logit, probit, log, cloglog
gaussian            identity, log, inverse
Gamma               identity, inverse, log
inverse.gaussian    1/mu^2, identity, inverse, log
poisson             identity, log, sqrt
quasi               logit, probit, cloglog, identity, inverse, log, 1/mu^2, sqrt
+
+ +

The combination of a response distribution, a link function and various +other pieces of information that are needed to carry out the modeling +exercise is called the family of the generalized linear model. +

+
+ + + +

11.6.2 The glm() function

+ + +

Since the distribution of the response depends on the stimulus variables +through a single linear function only, the same mechanism as was +used for linear models can still be used to specify the linear part of a +generalized model. The family has to be specified in a different way. +

+

The R function to fit a generalized linear model is glm() +which uses the form +

+
+
> fitted.model <- glm(formula, family=family.generator, data=data.frame)
+
+ +

The only new feature is the family.generator, which is the +instrument by which the family is described. It is the name of a +function that generates a list of functions and expressions that +together define and control the model and estimation process. Although +this may seem a little complicated at first sight, its use is quite +simple. +

+

The names of the standard, supplied family generators are given under +“Family Name” in the table in Families. Where there is a choice +of links, the name of the link may also be supplied with the family +name, in parentheses as a parameter. In the case of the quasi +family, the variance function may also be specified in this way. +

+

Some examples make the process clear. +

+ +

The gaussian family

+ +

A call such as +

+
+
> fm <- glm(y ~ x1 + x2, family = gaussian, data = sales)
+
+ +

achieves the same result as +

+
+
> fm <- lm(y ~ x1+x2, data=sales)
+
+ +

but much less efficiently. Note how the gaussian family is not +automatically provided with a choice of links, so no parameter is +allowed. If a problem requires a gaussian family with a nonstandard +link, this can usually be achieved through the quasi family, as +we shall see later. +

+ +

The binomial family

+ +

Consider a small, artificial example, from Silvey (1970). +

+

On the Aegean island of Kalythos the male inhabitants suffer from a +congenital eye disease, the effects of which become more marked with +increasing age. Samples of islander males of various ages were tested +for blindness and the results recorded. The data is shown below: +

+ + + + +
Age:          20   35   45   55   70
No. tested:   50   50   50   50   50
No. blind:     6   17   26   37   44
+ +

The problem we consider is to fit both logistic and probit models to +this data, and to estimate for each model the LD50, that is the age at +which the chance of blindness for a male inhabitant is 50%. +

+

If y is the number of blind at age x and n the +number tested, both models have the form +y ~ B(n, F(beta_0 + beta_1 x)) +where for the probit case, +F(z) = Phi(z) +is the standard normal distribution function, and in the logit case +(the default), +F(z) = e^z/(1+e^z). +In both cases the LD50 is +LD50 = - beta_0/beta_1 +that is, the point at which the argument of the distribution function is +zero. +

+

The first step is to set the data up as a data frame +

+
+
> kalythos <- data.frame(x = c(20,35,45,55,70), n = rep(50,5),
+                         y = c(6,17,26,37,44))
+
+ +

To fit a binomial model using glm() there are three possibilities +for the response: +

+
    +
  • If the response is a vector it is assumed to hold binary +data, and so must be a 0/1 vector. + +
  • If the response is a two-column matrix it is assumed that the +first column holds the number of successes for the trial and the second +holds the number of failures. + +
  • If the response is a factor, its first level is taken as failure +(0) and all other levels as ‘success’ (1). +
+ +

Here we need the second of these conventions, so we add a matrix to our +data frame: +

+
+
> kalythos$Ymat <- cbind(kalythos$y, kalythos$n - kalythos$y)
+
+ +

To fit the models we use +

+
+
> fmp <- glm(Ymat ~ x, family = binomial(link=probit), data = kalythos)
+> fml <- glm(Ymat ~ x, family = binomial, data = kalythos)
+
+ +

Since the logit link is the default the parameter may be omitted on the +second call. To see the results of each fit we could use +

+
+
> summary(fmp)
+> summary(fml)
+
+ +

Both models fit (all too) well. To find the LD50 estimate we can use a +simple function: +

+
+
> ld50 <- function(b) -b[1]/b[2]
+> ldp <- ld50(coef(fmp)); ldl <- ld50(coef(fml)); c(ldp, ldl)
+
+ +

The actual estimates from this data are 43.663 years and 43.601 years +respectively. +

+ +

Poisson models

+ +

With the Poisson family the default link is the log, and in +practice the major use of this family is to fit surrogate Poisson +log-linear models to frequency data, whose actual distribution is often +multinomial. This is a large and important subject we will not discuss +further here. It even forms a major part of the use of non-gaussian +generalized models overall. +

+

Occasionally genuinely Poisson data arises in practice and in the past +it was often analyzed as gaussian data after either a log or a +square-root transformation. As a graceful alternative to the latter, a +Poisson generalized linear model may be fitted as in the following +example: +

+
+
> fmod <- glm(y ~ A + B + x, family = poisson(link=sqrt),
+              data = worm.counts)
+
+ + +

Quasi-likelihood models

+ +

For all families the variance of the response will depend on the mean +and will have the scale parameter as a multiplier. The form of +dependence of the variance on the mean is a characteristic of the +response distribution; for example for the poisson distribution +Var(y) = mu. +

+

For quasi-likelihood estimation and inference the precise response +distribution is not specified, but rather only a link function and the +form of the variance function as it depends on the mean. Since +quasi-likelihood estimation uses formally identical techniques to those +for the gaussian distribution, this family provides a way of fitting +gaussian models with non-standard link functions or variance functions, +incidentally. +

+

For example, consider fitting the non-linear regression +y = theta_1 z_1 / (z_2 - theta_2) + e +which may be written alternatively as +y = 1 / (beta_1 x_1 + beta_2 x_2) + e +where +x_1 = z_2/z_1, x_2 = -1/z_1, beta_1 = 1/theta_1, and beta_2 = +theta_2/theta_1. +Supposing a suitable data frame to be set up we could fit this +non-linear regression as +

+
+
> nlfit <- glm(y ~ x1 + x2 - 1,
+               family = quasi(link=inverse, variance=constant),
+               data = biochem)
+
+ +

The reader is referred to the manual and the help document for further +information, as needed. +

+
+ + + +

11.7 Nonlinear least squares and maximum likelihood models

+ + +

Certain forms of nonlinear model can be fitted by Generalized Linear +Models (glm()). But in the majority of cases we have to approach +the nonlinear curve fitting problem as one of nonlinear optimization. +R’s nonlinear optimization routines are optim(), nlm() +and nlminb(), + + + +which provide the functionality (and more) of S-PLUS’s ms() and +nlminb(). We seek the parameter values that minimize some index +of lack-of-fit, and they do this by trying out various parameter values +iteratively. Unlike linear regression for example, there is no +guarantee that the procedure will converge on satisfactory estimates. +All the methods require initial guesses about what parameter values to +try, and convergence may depend critically upon the quality of the +starting values. +

+ + + + + +
+ + + +

11.7.1 Least squares

+ +

One way to fit a nonlinear model is by minimizing the sum of the squared +errors (SSE) or residuals. This method makes sense if the observed +errors could have plausibly arisen from a normal distribution. +

+

Here is an example from Bates & Watts (1988), page 51. The data are: +

+
+
> x <- c(0.02, 0.02, 0.06, 0.06, 0.11, 0.11, 0.22, 0.22, 0.56, 0.56,
+         1.10, 1.10)
+> y <- c(76, 47, 97, 107, 123, 139, 159, 152, 191, 201, 207, 200)
+
+ +

The fit criterion to be minimized is: +

+
+
> fn <- function(p) sum((y - (p[1] * x)/(p[2] + x))^2)
+
+ +

In order to do the fit we need initial estimates of the parameters. One +way to find sensible starting values is to plot the data, guess some +parameter values, and superimpose the model curve using those values. +

+
+
> plot(x, y)
+> xfit <- seq(.02, 1.1, .05)
+> yfit <- 200 * xfit/(0.1 + xfit)
+> lines(spline(xfit, yfit))
+
+ +

We could do better, but these starting values of 200 and 0.1 seem +adequate. Now do the fit: +

+
+
> out <- nlm(fn, p = c(200, 0.1), hessian = TRUE)
+
+ + +

After the fitting, out$minimum is the SSE, and +out$estimate are the least squares estimates of the parameters. +To obtain the approximate standard errors (SE) of the estimates we do: +

+
+
> sqrt(diag(2*out$minimum/(length(y) - 2) * solve(out$hessian)))
+
+ +

The 2 which is subtracted in the line above represents the number +of parameters. A 95% confidence interval would be the parameter +estimate +/- 1.96 SE. We can superimpose the least squares +fit on a new plot: +

+
+
> plot(x, y)
+> xfit <- seq(.02, 1.1, .05)
+> yfit <- 212.68384222 * xfit/(0.06412146 + xfit)
+> lines(spline(xfit, yfit))
+
+ +

The standard package stats provides much more extensive facilities +for fitting non-linear models by least squares. The model we have just +fitted is the Michaelis-Menten model, so we can use +

+
+
> df <- data.frame(x=x, y=y)
+> fit <- nls(y ~ SSmicmen(x, Vm, K), df)
+> fit
+Nonlinear regression model
+  model:  y ~ SSmicmen(x, Vm, K)
+   data:  df
+          Vm            K
+212.68370711   0.06412123
+ residual sum-of-squares:  1195.449
+> summary(fit)
+
+Formula: y ~ SSmicmen(x, Vm, K)
+
+Parameters:
+    Estimate Std. Error t value Pr(>|t|)
+Vm 2.127e+02  6.947e+00  30.615 3.24e-11
+K  6.412e-02  8.281e-03   7.743 1.57e-05
+
+Residual standard error: 10.93 on 10 degrees of freedom
+
+Correlation of Parameter Estimates:
+      Vm
+K 0.7651
+
+ +
+ + + +

11.7.2 Maximum likelihood

+ + +

Maximum likelihood is a method of nonlinear model fitting that applies +even if the errors are not normal. The method finds the parameter values +which maximize the log likelihood, or equivalently which minimize the +negative log-likelihood. Here is an example from Dobson (1990), pp. +108–111. This example fits a logistic model to dose-response data, +which clearly could also be fit by glm(). The data are: +

+
+
> x <- c(1.6907, 1.7242, 1.7552, 1.7842, 1.8113,
+         1.8369, 1.8610, 1.8839)
+> y <- c( 6, 13, 18, 28, 52, 53, 61, 60)
+> n <- c(59, 60, 62, 56, 63, 59, 62, 60)
+
+ +

The negative log-likelihood to minimize is: +

+
+
> fn <- function(p)
+   sum( - (y*(p[1]+p[2]*x) - n*log(1+exp(p[1]+p[2]*x))
+           + log(choose(n, y)) ))
+
+ +

We pick sensible starting values and do the fit: +

+
+
> out <- nlm(fn, p = c(-50,20), hessian = TRUE)
+
+ + +

After the fitting, out$minimum is the negative log-likelihood, +and out$estimate are the maximum likelihood estimates of the +parameters. To obtain the approximate SEs of the estimates we do: +

+
+
> sqrt(diag(solve(out$hessian)))
+
+ +

A 95% confidence interval would be the parameter estimate +/- +1.96 SE. +

+
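As the text notes, the same dose-response model can also be fitted with glm(); a brief check, using the x, y and n vectors above together with the two-column response convention described earlier, is:

> fmb <- glm(cbind(y, n - y) ~ x, family = binomial)
> coef(fmb)               # should agree closely with out$estimate
> sqrt(diag(vcov(fmb)))   # approximate standard errors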
+ + + +

11.8 Some non-standard models

+ +

We conclude this chapter with just a brief mention of some of the other +facilities available in R for special regression and data analysis +problems. +

+
    +
  • +Mixed models. The recommended nlme package provides +functions lme() and nlme() + + +for linear and non-linear mixed-effects models, that is linear and +non-linear regressions in which some of the coefficients correspond to +random effects. These functions make heavy use of formulae to specify +the models. + +
  • Local approximating regressions. The loess() function fits a nonparametric regression by using a locally weighted regression. Such regressions are useful for highlighting a trend in messy data or for data reduction to give some insight into a large data set (a brief sketch follows this list).

    Function loess is in the standard package stats, together +with code for projection pursuit regression. + +

    +
  • +Robust regression. There are several functions available for +fitting regression models in a way resistant to the influence of extreme +outliers in the data. Function lqs + +in the recommended package MASS provides state-of-art algorithms +for highly-resistant fits. Less resistant but statistically more +efficient methods are available in packages, for example function +rlm + +in package MASS. + +
  • +Additive models. This technique aims to construct a regression +function from smooth additive functions of the determining variables, +usually one for each determining variable. Functions avas and +ace + + +in package acepack and functions bruto and mars + + +in package mda provide some examples of these techniques in +user-contributed packages to R. An extension is Generalized +Additive Models, implemented in user-contributed packages gam and +mgcv. + +
  • +Tree-based models. Rather than seek an explicit global linear +model for prediction or interpretation, tree-based models seek to +bifurcate the data, recursively, at critical points of the determining +variables in order to partition the data ultimately into groups that are +as homogeneous as possible within, and as heterogeneous as possible +between. The results often lead to insights that other data analysis +methods tend not to yield. + +

    Models are again specified in the ordinary linear model form. The model +fitting function is tree(), + +but many other generic functions such as plot() and text() +are well adapted to displaying the results of a tree-based model fit in +a graphical way. +

    +

    Tree models are available in R via the user-contributed +packages rpart and tree. +

    +
+ +
+ +
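A minimal loess() sketch on made-up data (the trend, the noise level and the span value are invented purely for illustration):

> set.seed(1)
> d <- data.frame(x = sort(runif(100, 0, 10)))
> d$y <- sin(d$x) + rnorm(100, sd = 0.3)
> fit <- loess(y ~ x, data = d, span = 0.5)
> plot(d$x, d$y)
> lines(d$x, predict(fit), col = "blue")   # smooth trend through the scatter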
+


+
+ +

12 Graphical procedures

+ +

Graphical facilities are an important and extremely versatile component +of the R environment. It is possible to use the facilities to +display a wide variety of statistical graphs and also to build entirely +new types of graph. +

+

The graphics facilities can be used in both interactive and batch modes, but in most cases, interactive use is more productive. Interactive use is also easy because at startup time R initiates a graphics device driver which opens a special graphics window for the display of interactive graphics. Although this is done automatically, it may be useful to know that the command used is X11() under UNIX, windows() under Windows and quartz() under OS X. A new device can always be opened by dev.new().

+

Once the device driver is running, R plotting commands can be used to +produce a variety of graphical displays and to create entirely new kinds +of display. +

+

Plotting commands are divided into three basic groups: +

+
    +
  • High-level plotting functions create a new plot on the graphics +device, possibly with axes, labels, titles and so on. +
  • Low-level plotting functions add more information to an +existing plot, such as extra points, lines and labels. +
  • Interactive graphics functions allow you interactively add +information to, or extract information from, an existing plot, using a +pointing device such as a mouse. +
+ +

In addition, R maintains a list of graphical parameters which +can be manipulated to customize your plots. +

+

This manual only describes what are known as ‘base’ graphics. A +separate graphics sub-system in package grid coexists with base – +it is more powerful but harder to use. There is a recommended package +lattice which builds on grid and provides ways to produce +multi-panel plots akin to those in the Trellis system in S. +

+ + + + + + + + + + +
+ +
+


+
+ +

12.1 High-level plotting commands

+ +

High-level plotting functions are designed to generate a complete plot +of the data passed as arguments to the function. Where appropriate, +axes, labels and titles are automatically generated (unless you request +otherwise.) High-level plotting commands always start a new plot, +erasing the current plot if necessary. +

+ + + + + + + +
+ + + +

12.1.1 The plot() function

+ + +

One of the most frequently used plotting functions in R is the +plot() function. This is a generic function: the type of +plot produced is dependent on the type or class of the first +argument. +

+
+
plot(x, y)
+
plot(xy)
+

If x and y are vectors, plot(x, y) +produces a scatterplot of y against x. The same effect can +be produced by supplying one argument (second form) as either a list +containing two elements x and y or a two-column matrix. +

+
+
plot(x)
+

If x is a time series, this produces a time-series plot. If +x is a numeric vector, it produces a plot of the values in the +vector against their index in the vector. If x is a complex +vector, it produces a plot of imaginary versus real parts of the vector +elements. +

+
+
plot(f)
+
plot(f, y)
+

f is a factor object, y is a numeric vector. The first form +generates a bar plot of f; the second form produces boxplots of +y for each level of f. +

+
+
plot(df)
+
plot(~ expr)
+
plot(y ~ expr)
+

df is a data frame, y is any object, expr is a list +of object names separated by ‘+’ (e.g., a + b + c). The +first two forms produce distributional plots of the variables in a data +frame (first form) or of a number of named objects (second form). The +third form plots y against every object named in expr. +

+
+ +
+ + + +

12.1.2 Displaying multivariate data

+ +

R provides two very useful functions for representing multivariate +data. If X is a numeric matrix or data frame, the command +

+
+
> pairs(X)
+
+ + +

produces a pairwise scatterplot matrix of the variables defined by the +columns of X, that is, every column of X is plotted +against every other column of X and the resulting n(n-1) +plots are arranged in a matrix with plot scales constant over the rows +and columns of the matrix. +

+

When three or four variables are involved a coplot may be more +enlightening. If a and b are numeric vectors and c +is a numeric vector or factor object (all of the same length), then +the command +

+
+
> coplot(a ~ b | c)
+
+ + +

produces a number of scatterplots of a against b for given +values of c. If c is a factor, this simply means that +a is plotted against b for every level of c. When +c is numeric, it is divided into a number of conditioning +intervals and for each interval a is plotted against b +for values of c within the interval. The number and position of +intervals can be controlled with given.values= argument to +coplot()—the function co.intervals() is useful for +selecting intervals. You can also use two given variables with a +command like +

+
+
> coplot(a ~ b | c + d)
+
+ +

which produces scatterplots of a against b for every joint +conditioning interval of c and d. +

+

The coplot() and pairs() functions both take an argument panel= which can be used to customize the type of plot which appears in each panel. The default is points() to produce a scatterplot, but by supplying some other low-level graphics function of two vectors x and y as the value of panel= you can produce any type of plot you wish. An example panel function useful for coplots is panel.smooth().

+
+ + + +

12.1.3 Display graphics

+ +

Other high-level graphics functions produce different types of plots. +Some examples are: +

+
+
qqnorm(x)
+
qqline(x)
+
qqplot(x, y)
+
+ + +

Distribution-comparison plots. The first form plots the numeric vector +x against the expected Normal order scores (a normal scores plot) +and the second adds a straight line to such a plot by drawing a line +through the distribution and data quartiles. The third form plots the +quantiles of x against those of y to compare their +respective distributions. +

+
+
hist(x)
+
hist(x, nclass=n)
+
hist(x, breaks=b, …)
+
+

Produces a histogram of the numeric vector x. A sensible number +of classes is usually chosen, but a recommendation can be given with the +nclass= argument. Alternatively, the breakpoints can be +specified exactly with the breaks= argument. If the +probability=TRUE argument is given, the bars represent relative +frequencies divided by bin width instead of counts. +

+
+
dotchart(x, …)
+
+

Constructs a dotchart of the data in x. In a dotchart the +y-axis gives a labelling of the data in x and the +x-axis gives its value. For example it allows easy visual +selection of all data entries with values lying in specified ranges. +

+
+
image(x, y, z, …)
+
contour(x, y, z, …)
+
persp(x, y, z, …)
+
+ + +

Plots of three variables. The image plot draws a grid of rectangles +using different colours to represent the value of z, the contour +plot draws contour lines to represent the value of z, and the +persp plot draws a 3D surface. +

+
+ +
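Hypothetical one-line illustrations of several of these display functions (the data are simulated here purely for demonstration):

> x <- rnorm(1000)
> hist(x, breaks = 20, probability = TRUE)   # relative-frequency histogram
> qqnorm(x); qqline(x)                       # normal scores plot with reference line
> z <- outer(1:30, 1:30, function(i, j) sin(i/5) * cos(j/5))
> image(z)                                   # coloured grid of the z values
> contour(z, add = TRUE)                     # contour lines on top of the image
> persp(z)                                   # the same surface in 3D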
+ + + +

12.1.4 Arguments to high-level plotting functions

+ +

There are a number of arguments which may be passed to high-level +graphics functions, as follows: +

+
+
add=TRUE
+

Forces the function to act as a low-level graphics function, +superimposing the plot on the current plot (some functions only). +

+
+
axes=FALSE
+

Suppresses generation of axes—useful for adding your own custom axes +with the axis() function. The default, axes=TRUE, means +include axes. +

+
+
log="x"
+
log="y"
+
log="xy"
+

Causes the x, y or both axes to be logarithmic. This will +work for many, but not all, types of plot. +

+
+
type=
+

The type= argument controls the type of plot produced, as +follows: +

+
+
type="p"
+

Plot individual points (the default) +

+
type="l"
+

Plot lines +

+
type="b"
+

Plot points connected by lines (both) +

+
type="o"
+

Plot points overlaid by lines +

+
type="h"
+

Plot vertical lines from points to the zero axis (high-density) +

+
type="s"
+
type="S"
+

Step-function plots. In the first form, the top of the vertical defines +the point; in the second, the bottom. +

+
type="n"
+

No plotting at all. However axes are still drawn (by default) and the +coordinate system is set up according to the data. Ideal for creating +plots with subsequent low-level graphics functions. +

+
+ +
+
xlab=string
+
ylab=string
+

Axis labels for the x and y axes. Use these arguments to +change the default labels, usually the names of the objects used in the +call to the high-level plotting function. +

+
+
main=string
+

Figure title, placed at the top of the plot in a large font. +

+
+
sub=string
+

Sub-title, placed just below the x-axis in a smaller font. +

+
+ +
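Several of these arguments can be combined in a single call; in this hypothetical sketch x and y are assumed to be numeric vectors, with y strictly positive so that a logarithmic axis is valid:

> plot(x, y, type = "b", log = "y", main = "A demonstration plot",
       xlab = "dose", ylab = "response (log scale)")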
+ + + +

12.2 Low-level plotting commands

+ +

Sometimes the high-level plotting functions don’t produce exactly the +kind of plot you desire. In this case, low-level plotting commands can +be used to add extra information (such as points, lines or text) to the +current plot. +

+

Some of the more useful low-level plotting functions are: +

+
+
points(x, y)
+
lines(x, y)
+
+ +

Adds points or connected lines to the current plot. plot()’s +type= argument can also be passed to these functions (and +defaults to "p" for points() and "l" for +lines().) +

+
+
text(x, y, labels, …)
+
+

Add text to a plot at points given by x, y. Normally +labels is an integer or character vector in which case +labels[i] is plotted at point (x[i], y[i]). The default +is 1:length(x). +

+

Note: This function is often used in the sequence +

+
+
> plot(x, y, type="n"); text(x, y, names)
+
+ +

The graphics parameter type="n" suppresses the points but sets up +the axes, and the text() function supplies special characters, as +specified by the character vector names for the points. +

+
+
abline(a, b)
+
abline(h=y)
+
abline(v=x)
+
abline(lm.obj)
+
+

Adds a line of slope b and intercept a to the current plot. h=y may be used to specify y-coordinates for the heights of horizontal lines to go across a plot, and v=x similarly for the x-coordinates for vertical lines. Also lm.obj may be a list with a coefficients component of length 2 (such as the result of model-fitting functions), which are taken as an intercept and slope, in that order.

+
+
polygon(x, y, …)
+
+

Draws a polygon defined by the ordered vertices in (x, y) +and (optionally) shade it in with hatch lines, or fill it if the +graphics device allows the filling of figures. +

+
+
legend(x, y, legend, …)
+
+

Adds a legend to the current plot at the specified position. Plotting +characters, line styles, colors etc., are identified with the labels in +the character vector legend. At least one other argument v +(a vector the same length as legend) with the corresponding +values of the plotting unit must also be given, as follows: +

+
+
legend( , fill=v)
+

Colors for filled boxes +

+
legend( , col=v)
+

Colors in which points or lines will be drawn +

+
legend( , lty=v)
+

Line styles +

+
legend( , lwd=v)
+

Line widths +

+
legend( , pch=v)
+

Plotting characters (character vector) +

+
+ +
+
title(main, sub)
+
+

Adds a title main to the top of the current plot in a large font +and (optionally) a sub-title sub at the bottom in a smaller font. +

+
+
axis(side, …)
+
+

Adds an axis to the current plot on the side given by the first argument +(1 to 4, counting clockwise from the bottom.) Other arguments control +the positioning of the axis within or beside the plot, and tick +positions and labels. Useful for adding custom axes after calling +plot() with the axes=FALSE argument. +

+
+ +

Low-level plotting functions usually require some positioning +information (e.g., x and y coordinates) to determine where +to place the new plot elements. Coordinates are given in terms of +user coordinates which are defined by the previous high-level +graphics command and are chosen based on the supplied data. +

+

Where x and y arguments are required, it is also +sufficient to supply a single argument being a list with elements named +x and y. Similarly a matrix with two columns is also +valid input. In this way functions such as locator() (see below) +may be used to specify positions on a plot interactively. +

+ + + + + +
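A short composite sketch combining several of these low-level functions (x and y are assumed to be numeric vectors of equal length):

> plot(x, y, type = "n")                      # axes and coordinate system only
> points(x, y, pch = 16)
> abline(lm(y ~ x), lty = 2)                  # add a fitted straight line
> legend("topleft", legend = c("data", "fit"),
         pch = c(16, NA), lty = c(NA, 2))
> title(main = "Low-level additions", sub = "points, line, legend and title")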
+ + + +

12.2.1 Mathematical annotation

+ +

In some cases, it is useful to add mathematical symbols and formulae to a +plot. This can be achieved in R by specifying an expression rather +than a character string in any one of text, mtext, axis, +or title. For example, the following code draws the formula for +the Binomial probability function: +

+
+
> text(x, y, expression(paste(bgroup("(", atop(n, x), ")"), p^x, q^{n-x})))
+
+ +

More information, including a full listing of the features available, can be obtained from within R using the commands:

+
+
> help(plotmath)
+> example(plotmath)
+> demo(plotmath)
+
+ +
+ + + +

12.2.2 Hershey vector fonts

+ +

It is possible to specify Hershey vector fonts for rendering text when using +the text and contour functions. There are three reasons for +using the Hershey fonts: +

    +
  • Hershey fonts can produce better +output, especially on a computer screen, for rotated and/or small text. +
  • Hershey fonts +provide certain symbols that may not be available +in the standard fonts. In particular, there are zodiac signs, cartographic +symbols and astronomical symbols. +
  • Hershey fonts provide cyrillic and japanese (Kana and Kanji) characters. +
+ +

More information, including tables of Hershey characters can be obtained from +within R using the commands: +

+
+
> help(Hershey)
+> demo(Hershey)
+> help(Japanese)
+> demo(Japanese)
+
+ +
+ + + +

12.3 Interacting with graphics

+ +

R also provides functions which allow users to extract or add +information to a plot using a mouse. The simplest of these is the +locator() function: +

+
+
locator(n, type)
+
+

Waits for the user to select locations on the current plot using the +left mouse button. This continues until n (default 512) points +have been selected, or another mouse button is pressed. The +type argument allows for plotting at the selected points and has +the same effect as for high-level graphics commands; the default is no +plotting. locator() returns the locations of the points selected +as a list with two components x and y. +

+
+ +

locator() is usually called with no arguments. It is +particularly useful for interactively selecting positions for graphic +elements such as legends or labels when it is difficult to calculate in +advance where the graphic should be placed. For example, to place some +informative text near an outlying point, the command +

+
+
> text(locator(1), "Outlier", adj=0)
+
+ +

may be useful. (locator() will be ignored if the current device, such as postscript, does not support interactive pointing.)

+
+
identify(x, y, labels)
+
+

Allow the user to highlight any of the points defined by x and +y (using the left mouse button) by plotting the corresponding +component of labels nearby (or the index number of the point if +labels is absent). Returns the indices of the selected points +when another button is pressed. +

+
+ +

Sometimes we want to identify particular points on a plot, rather +than their positions. For example, we may wish the user to select some +observation of interest from a graphical display and then manipulate +that observation in some way. Given a number of (x, y) +coordinates in two numeric vectors x and y, we could use +the identify() function as follows: +

+
+
> plot(x, y)
+> identify(x, y)
+
+ +

The identify() function performs no plotting itself, but simply allows the user to move the mouse pointer and click the left mouse button near a point. If there is a point near the mouse pointer it will be marked with its index number (that is, its position in the x/y vectors) plotted nearby. Alternatively, you could use some informative string (such as a case name) as a highlight by using the labels argument to identify(), or disable marking altogether with the plot = FALSE argument. When the process is terminated (see above), identify() returns the indices of the selected points; you can use these indices to extract the selected points from the original vectors x and y.

+
+ + + +

12.4 Using graphics parameters

+ +

When creating graphics, particularly for presentation or publication purposes, R’s defaults do not always produce exactly that which is required. You can, however, customize almost every aspect of the display using graphics parameters. R maintains a list of a large number of graphics parameters which control things such as line style, colors, figure arrangement and text justification among many others. Every graphics parameter has a name (such as ‘col’, which controls colors) and a value (a color number, for example).

+

A separate list of graphics parameters is maintained for each active +device, and each device has a default set of parameters when +initialized. Graphics parameters can be set in two ways: either +permanently, affecting all graphics functions which access the current +device; or temporarily, affecting only a single graphics function call. +

+ + + + + +
+ + + +

12.4.1 Permanent changes: The par() function

+ + + +

The par() function is used to access and modify the list of +graphics parameters for the current graphics device. +

+
+
par()
+

Without arguments, returns a list of all graphics parameters and their +values for the current device. +

+
par(c("col", "lty"))
+

With a character vector argument, returns only the named graphics +parameters (again, as a list.) +

+
par(col=4, lty=2)
+

With named arguments (or a single list argument), sets the values of +the named graphics parameters, and returns the original values of the +parameters as a list. +

+
+ +

Setting graphics parameters with the par() function changes the +value of the parameters permanently, in the sense that all future +calls to graphics functions (on the current device) will be affected by +the new value. You can think of setting graphics parameters in this way +as setting “default” values for the parameters, which will be used by +all graphics functions unless an alternative value is given. +

+

Note that calls to par() always affect the global values +of graphics parameters, even when par() is called from within a +function. This is often undesirable behavior—usually we want to set +some graphics parameters, do some plotting, and then restore the +original values so as not to affect the user’s R session. You can +restore the initial values by saving the result of par() when +making changes, and restoring the initial values when plotting is +complete. +

+
+
> oldpar <- par(col=4, lty=2)
+  … plotting commands …
+> par(oldpar)
+
+ +

To save and restore all settable graphical parameters use

+
+
> oldpar <- par(no.readonly=TRUE)
+  … plotting commands …
+> par(oldpar)
+
+ + +
+ + + +

12.4.2 Temporary changes: Arguments to graphics functions

+ +

Graphics parameters may also be passed to (almost) any graphics function +as named arguments. This has the same effect as passing the arguments +to the par() function, except that the changes only last for the +duration of the function call. For example: +

+
+
> plot(x, y, pch="+")
+
+ +

produces a scatterplot using a plus sign as the plotting character, +without changing the default plotting character for future plots. +

+

Unfortunately, this is not implemented entirely consistently, and it is sometimes necessary to set and reset graphics parameters using par().
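For example (a sketch only; the particular parameter is unimportant), a layout parameter such as mfrow cannot be supplied as an argument to plot() and has to be set and restored with par():

> oldpar <- par(mfrow=c(1, 2))      # two figures side by side
> plot(rnorm(50)); hist(rnorm(50))
> par(oldpar)                       # put the layout back as it was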

+ +
+ + + +

12.5 Graphics parameters list

+ +

The following sections detail many of the commonly-used graphical +parameters. The R help documentation for the par() function +provides a more concise summary; this is provided as a somewhat more +detailed alternative. +

+

Graphics parameters will be presented in the following form: +

+
+
name=value
+

A description of the parameter’s effect. name is the name of the +parameter, that is, the argument name to use in calls to par() or +a graphics function. value is a typical value you might use when +setting the parameter. +

+
+ +

Note that axes is not a graphics parameter but an +argument to a few plot methods: see xaxt and yaxt. +

+ + + + + + + +
+ + + +

12.5.1 Graphical elements

+ +

R plots are made up of points, lines, text and polygons (filled +regions.) Graphical parameters exist which control how these +graphical elements are drawn, as follows: +

+
+
pch="+"
+

Character to be used for plotting points. The default varies with +graphics drivers, but it is usually +a circle. +Plotted points tend to appear slightly above or below the appropriate +position unless you use "." as the plotting character, which +produces centered points. +

+
+
pch=4
+

When pch is given as an integer between 0 and 25 inclusive, a +specialized plotting symbol is produced. To see what the symbols are, +use the command +

+
+
> legend(locator(1), as.character(0:25), pch = 0:25)
+
+ +

Those from 21 to 25 may appear to duplicate earlier symbols, but can be +coloured in different ways: see the help on points and its +examples. +

+

In addition, pch can be a character or a number in the range +32:255 representing a character in the current font. +

+
+
lty=2
+

Line types. Alternative line styles are not supported on all graphics +devices (and vary on those that do) but line type 1 is always a solid +line, line type 0 is always invisible, and line types 2 and onwards are +dotted or dashed lines, or some combination of both. +

+
+
lwd=2
+

Line widths. Desired width of lines, in multiples of the “standard” +line width. Affects axis lines as well as lines drawn with +lines(), etc. Not all devices support this, and some have +restrictions on the widths that can be used. +

+
+
col=2
+

Colors to be used for points, lines, text, filled regions and images. +A number from the current palette (see ?palette) or a named colour. +

+
+
col.axis
+
col.lab
+
col.main
+
col.sub
+

The color to be used for axis annotation, x and y labels, +main and sub-titles, respectively. +

+
+
font=2
+

An integer which specifies which font to use for text. If possible, device drivers arrange so that 1 corresponds to plain text, 2 to bold face, 3 to italic, 4 to bold italic and 5 to a symbol font (which includes Greek letters).

+
+
font.axis
+
font.lab
+
font.main
+
font.sub
+

The font to be used for axis annotation, x and y labels, +main and sub-titles, respectively. +

+
+
adj=-0.1
+

Justification of text relative to the plotting position. 0 means +left justify, 1 means right justify and 0.5 means to +center horizontally about the plotting position. The actual value is +the proportion of text that appears to the left of the plotting +position, so a value of -0.1 leaves a gap of 10% of the text width +between the text and the plotting position. +

+
+
cex=1.5
+

Character expansion. The value is the desired size of text characters +(including plotting characters) relative to the default text size. +

+
+
cex.axis
+
cex.lab
+
cex.main
+
cex.sub
+

The character expansion to be used for axis annotation, x and +y labels, main and sub-titles, respectively. +
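A short call combining several of these parameters may make their effect clearer (the values are arbitrary and chosen purely for illustration):

> plot(1:10, (1:10)^2, type="b", pch=4, col="blue", lty=2, lwd=2, cex=1.5)
> title(main="Squares", font.main=2, col.main="red")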

+
+ +
+ + + +

12.5.2 Axes and tick marks

+ +

Many of R’s high-level plots have axes, and you can construct axes +yourself with the low-level axis() graphics function. Axes have +three main components: the axis line (line style controlled by the +lty graphics parameter), the tick marks (which mark off unit +divisions along the axis line) and the tick labels (which mark the +units.) These components can be customized with the following graphics +parameters. +

+
+
lab=c(5, 7, 12)
+

The first two numbers are the desired number of tick intervals on the +x and y axes respectively. The third number is the +desired length of axis labels, in characters (including the decimal +point.) Choosing a too-small value for this parameter may result in all +tick labels being rounded to the same number! +

+
+
las=1
+

Orientation of axis labels. 0 means always parallel to axis, +1 means always horizontal, and 2 means always +perpendicular to the axis. +

+
+
mgp=c(3, 1, 0)
+

Positions of axis components. The first component is the distance from +the axis label to the axis position, in text lines. The second +component is the distance to the tick labels, and the final component is +the distance from the axis position to the axis line (usually zero). +Positive numbers measure outside the plot region, negative numbers +inside. +

+
+
tck=0.01
+

Length of tick marks, as a fraction of the size of the plotting region. +When tck is small (less than 0.5) the tick marks on the x +and y axes are forced to be the same size. A value of 1 gives +grid lines. Negative values give tick marks outside the plotting +region. Use tck=0.01 and mgp=c(1,-1.5,0) for internal +tick marks. +

+
+
xaxs="r"
+
yaxs="i"
+

Axis styles for the x and y axes, respectively. With styles "i" (internal) and "r" (the default) tick marks always fall within the range of the data; however, style "r" leaves a small amount of space at the edges. (S has other styles not implemented in R.)
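As an illustrative sketch (again with arbitrary values), several of these parameters can be supplied directly to a high-level plotting function:

> plot(1:100, rnorm(100), las=1, tck=0.01, mgp=c(2, 0.5, 0), xaxs="i")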

+
+
+ +
+ + + +

12.5.3 Figure margins

+ + +

A single plot in R is known as a figure and comprises a +plot region surrounded by margins (possibly containing axis +labels, titles, etc.) and (usually) bounded by the axes themselves. +

+

A typical figure is +

+images/fig11 + +

Graphics parameters controlling figure layout include: +

+
+
mai=c(1, 0.5, 0.5, 0)
+

Widths of the bottom, left, top and right margins, respectively, +measured in inches. +

+
+
mar=c(4, 2, 2, 1)
+

Similar to mai, except the measurement unit is text lines. +

+
+ +

mar and mai are equivalent in the sense that setting one changes the value of the other. The default values chosen for this parameter are often too large; the right-hand margin is rarely needed, and neither is the top margin if no title is being used. The bottom and left margins must be large enough to accommodate the axis and tick labels. Furthermore, the default is chosen without regard to the size of the device surface: for example, using the postscript() driver with the height=4 argument will result in a plot which is about 50% margin unless mar or mai are set explicitly. When multiple figures are in use (see below) the margins are reduced; however, this may not be enough when many figures share the same page.
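A sketch of trimming the default margins before plotting, and restoring them afterwards:

> oldpar <- par(mar=c(4, 4, 2, 1))   # smaller top and right margins, in text lines
> plot(1:10)
> par(oldpar)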

+
+ + + +

12.5.4 Multiple figure environment

+ +

R allows you to create an n by m array of figures on a +single page. Each figure has its own margins, and the array of figures +is optionally surrounded by an outer margin, as shown in the +following figure. +

+images/fig12 + +

The graphical parameters relating to multiple figures are as follows: +

+
+
mfcol=c(3, 2)
+
mfrow=c(2, 4)
+

Set the size of a multiple figure array. The first value is the number of +rows; the second is the number of columns. The only difference between +these two parameters is that setting mfcol causes figures to be +filled by column; mfrow fills by rows. +

+

The layout in the Figure could have been created by setting +mfrow=c(3,2); the figure shows the page after four plots have +been drawn. +

+

Setting either of these can reduce the base size of symbols and text +(controlled by par("cex") and the pointsize of the device). In a +layout with exactly two rows and columns the base size is reduced by a +factor of 0.83: if there are three or more of either rows or columns, +the reduction factor is 0.66. +

+
+
mfg=c(2, 2, 3, 2)
+

Position of the current figure in a multiple figure environment. The first +two numbers are the row and column of the current figure; the last two +are the number of rows and columns in the multiple figure array. Set +this parameter to jump between figures in the array. You can even use +different values for the last two numbers than the true values +for unequally-sized figures on the same page. +

+
+
fig=c(4, 9, 1, 4)/10
+

Position of the current figure on the page. Values are the positions of +the left, right, bottom and top edges respectively, as a percentage of +the page measured from the bottom left corner. The example value would +be for a figure in the bottom right of the page. Set this parameter for +arbitrary positioning of figures within a page. If you want to add a +figure to a current page, use new=TRUE as well (unlike S). +

+
+
oma=c(2, 0, 3, 0)
+
omi=c(0, 0, 0.8, 0)
+

Size of outer margins. Like mar and mai, the first +measures in text lines and the second in inches, starting with the +bottom margin and working clockwise. +

+
+
+ +

Outer margins are particularly useful for page-wise titles, etc. Text +can be added to the outer margins with the mtext() function with +argument outer=TRUE. There are no outer margins by default, +however, so you must create them explicitly using oma or +omi. +

+

More complicated arrangements of multiple figures can be produced by the +split.screen() and layout() functions, as well as by the +grid and lattice packages. +
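A small sketch putting these pieces together: a 2 by 2 array of figures with an outer margin used for a page-wise title (the data are arbitrary):

> par(mfrow=c(2, 2), oma=c(0, 0, 3, 0))        # 2 x 2 array with a top outer margin
> for (i in 1:4) plot(rnorm(20), main=paste("Figure", i))
> mtext("A page-wise title", outer=TRUE, cex=1.5)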

+
+ +
+


+
+ +

12.6 Device drivers

+ + +

R can generate graphics (of varying levels of quality) on almost any +type of display or printing device. Before this can begin, however, +R needs to be informed what type of device it is dealing with. This +is done by starting a device driver. The purpose of a device +driver is to convert graphical instructions from R (“draw a line,” +for example) into a form that the particular device can understand. +

+

Device drivers are started by calling a device driver function. There +is one such function for every device driver: type help(Devices) +for a list of them all. For example, issuing the command +

+
+
> postscript()
+
+ +

causes all future graphics output to be sent to the printer in +PostScript format. Some commonly-used device drivers are: +

+
+
X11()
+
+

For use with the X11 window system on Unix-alikes +

+
windows()
+
+

For use on Windows +

+
quartz()
+
+

For use on OS X +

+
postscript()
+
+

For printing on PostScript printers, or creating PostScript graphics +files. +

+
pdf()
+
+

Produces a PDF file, which can also be included into PDF files. +

+
png()
+
+

Produces a bitmap PNG file. (Not always available: see its help page.) +

+
jpeg()
+
+

Produces a bitmap JPEG file, best used for image plots. +(Not always available: see its help page.) +

+
+ +

When you have finished with a device, be sure to terminate the device +driver by issuing the command +

+
+
> dev.off()
+
+ +

This ensures that the device finishes cleanly; for example in the case +of hardcopy devices this ensures that every page is completed and has +been sent to the printer. (This will happen automatically at the normal +end of a session.) +
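For instance (a sketch; the file name is invented), a complete open, plot, close cycle with a file device looks like:

> png("scatter.png", width=600, height=400)   # open a PNG file device
> plot(rnorm(100), rnorm(100))
> dev.off()                                   # close it so the file is written out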

+ + + + + +
+ + + +

12.6.1 PostScript diagrams for typeset documents

+ +

By passing the file argument to the postscript() device +driver function, you may store the graphics in PostScript format in a +file of your choice. The plot will be in landscape orientation unless +the horizontal=FALSE argument is given, and you can control the +size of the graphic with the width and height arguments +(the plot will be scaled as appropriate to fit these dimensions.) For +example, the command +

+
+
> postscript("file.ps", horizontal=FALSE, height=5, pointsize=10)
+
+ +

will produce a file containing PostScript code for a figure five inches +high, perhaps for inclusion in a document. It is important to note that +if the file named in the command already exists, it will be overwritten. +This is the case even if the file was only created earlier in the same +R session. +

+

Many usages of PostScript output will be to incorporate the figure in +another document. This works best when encapsulated PostScript +is produced: R always produces conformant output, but only marks the +output as such when the onefile=FALSE argument is supplied. This +unusual notation stems from S-compatibility: it really means that +the output will be a single page (which is part of the EPSF +specification). Thus to produce a plot for inclusion use something like +

+
+
> postscript("plot1.eps", horizontal=FALSE, onefile=FALSE,
+             height=8, width=6, pointsize=10)
+
+ + +
+ + + +

12.6.2 Multiple graphics devices

+ +

In advanced use of R it is often useful to have several graphics +devices in use at the same time. Of course only one graphics device can +accept graphics commands at any one time, and this is known as the +current device. When multiple devices are open, they form a +numbered sequence with names giving the kind of device at any position. +

+

The main commands used for operating with multiple devices, and their +meanings are as follows: +

+
+
X11()
+

[UNIX] +

+
windows()
+
win.printer()
+
win.metafile()
+

[Windows] +

+
quartz()
+

[OS X] +

+
postscript()
+
pdf()
+
png()
+
jpeg()
+
tiff()
+
bitmap()
+
+

Each new call to a device driver function opens a new graphics device, +thus extending by one the device list. This device becomes the current +device, to which graphics output will be sent. +

+
+
dev.list()
+
+

Returns the number and name of all active devices. The device at +position 1 on the list is always the null device which does not +accept graphics commands at all. +

+
+
dev.next()
+
dev.prev()
+
+ +

Returns the number and name of the graphics device next to, or previous +to the current device, respectively. +

+
+
dev.set(which=k)
+
+

Can be used to change the current graphics device to the one at position +k of the device list. Returns the number and label of the device. +

+
+
dev.off(k)
+
+

Terminate the graphics device at position k of the device list. For some devices, such as postscript devices, this will either print the file immediately or correctly complete the file for later printing, depending on how the device was initiated.

+
+
dev.copy(device, …, which=k)
+
dev.print(device, …, which=k)
+

Make a copy of the device k. Here device is a device +function, such as postscript, with extra arguments, if needed, +specified by ‘’. dev.print is similar, but the +copied device is immediately closed, so that end actions, such as +printing hardcopies, are immediately performed. +

+
+
graphics.off()
+

Terminate all graphics devices on the list, except the null device. +
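A sketch of juggling two devices (assuming an interactive session on a Unix-alike; the file names are invented):

> X11(); plot(1:10)                       # a screen device becomes current
> pdf("working.pdf"); plot(rnorm(50))     # a second, file-based device
> dev.set(dev.prev())                     # make the screen device current again
> dev.print(pdf, file="snapshot.pdf")     # copy the screen plot to a file, closing the copy
> graphics.off()                          # shut down everything except the null device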

+
+ +
+ +
+


+
+ +

12.7 Dynamic graphics

+ + +

R does not have built-in capabilities for dynamic or interactive graphics, e.g. rotating point clouds or “brushing” (interactively highlighting) points. However, extensive dynamic graphics facilities are available in the system GGobi by Swayne, Cook and Buja, available from

+
+

http://www.ggobi.org/ +

+ +

and these can be accessed from R via the package rggobi, described at +http://www.ggobi.org/rggobi. +

+

Also, package rgl provides ways to interact with 3D plots, for example +of surfaces. +

+
+ +
+


+
+ +

13 Packages

+ + +

All R functions and datasets are stored in packages. Only +when a package is loaded are its contents available. This is done both +for efficiency (the full list would take more memory and would take +longer to search than a subset), and to aid package developers, who are +protected from name clashes with other code. The process of developing +packages is described in Creating R +packages in Writing R Extensions. Here, we will describe them +from a user’s point of view. +

+

To see which packages are installed at your site, issue the command +

+
+
> library()
+
+ +

with no arguments. To load a particular package (e.g., the boot +package containing functions from Davison & Hinkley (1997)), use a +command like +

+
+
> library(boot)
+
+ +

Users connected to the Internet can use the install.packages() +and update.packages() functions (available through the +Packages menu in the Windows and OS X GUIs, see Installing +packages in R Installation and Administration) to install +and update packages. +
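For example (this needs an Internet connection and a writable library tree; the package name is only an illustration):

> install.packages("boot")      # fetch and install a package from a CRAN mirror
> update.packages(ask=FALSE)    # bring the installed packages up to date
> library(boot)                 # load the newly installed package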

+

To see which packages are currently loaded, use +

+
+
> search()
+
+ +

to display the search list. Some packages may be loaded but not +available on the search list (see Namespaces): these will be +included in the list given by +

+
+
> loadedNamespaces()
+
+ + +

To see a list of all available help topics in an installed package, +use +

+
+
> help.start()
+
+ +

to start the HTML help system, and then navigate to the package +listing in the Reference section. +

+ + + + + + +
+ + + +

13.1 Standard packages

+ +

The standard (or base) packages are considered part of the R +source code. They contain the basic functions that allow R to work, +and the datasets and standard statistical and graphical functions that +are described in this manual. They should be automatically available in +any R installation. See R +packages in R FAQ, for a complete list. +

+
+ +
+


+
+ +

13.2 Contributed packages and CRAN

+ + +

There are thousands of contributed packages for R, written by many +different authors. Some of these packages implement specialized +statistical methods, others give access to data or hardware, and others +are designed to complement textbooks. Some (the recommended +packages) are distributed with every binary distribution of R. Most +are available for download from CRAN +(https://CRAN.R-project.org/ and its mirrors) and other +repositories such as Bioconductor (https://www.bioconductor.org/) +and Omegahat (http://www.omegahat.org/). The R FAQ +contains a list of CRAN packages current at the time of release, but the +collection of available packages changes very frequently. +

+
+ + + +

13.3 Namespaces

+ + + + +

All packages have namespaces, and have since R 2.14.0. +Namespaces do three things: they allow the package writer to hide +functions and data that are meant only for internal use, they prevent +functions from breaking when a user (or other package writer) picks a +name that clashes with one in the package, and they provide a way to +refer to an object within a particular package. +

+

For example, t() is the transpose function in R, but users +might define their own function named t. Namespaces prevent +the user’s definition from taking precedence, and breaking every +function that tries to transpose a matrix. +

+

There are two operators that work with namespaces. The double-colon +operator :: selects definitions from a particular namespace. +In the example above, the transpose function will always be available +as base::t, because it is defined in the base package. +Only functions that are exported from the package can be retrieved in +this way. +
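A small sketch of the masking problem and of the double-colon operator (the user-level definition of t is deliberately silly):

> t <- function(x) "not a transpose"   # masks the base function in the user workspace
> t(matrix(1:4, 2))                    # finds the user's t first
> base::t(matrix(1:4, 2))              # reaches the real transpose in the base namespace
> rm(t)                                # remove the mask again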

+

The triple-colon operator ::: may be seen in a few places in R +code: it acts like the double-colon operator but also allows access to +hidden objects. Users are more likely to use the getAnywhere() +function, which searches multiple packages. +

+

Packages are often inter-dependent, and loading one may cause others to +be automatically loaded. The colon operators described above will also +cause automatic loading of the associated package. When packages with +namespaces are loaded automatically they are not added to the search +list. +

+
+ +
+


+
+ +

14 OS facilities

+ +

R has quite extensive facilities to access the OS under which it is +running: this allows it to be used as a scripting language and that +ability is much used by R itself, for example to install packages. +

+

Because R’s own scripts need to work across all platforms, considerable effort has gone into making the scripting facilities as platform-independent as is feasible.

+ + + + + + + +
+ +
+


+
+ +

14.1 Files and directories

+ +

There are many functions to manipulate files and directories. Here are +pointers to some of the more commonly used ones. +

+

To create an (empty) file or directory, use file.create or dir.create. (These are the analogues of the POSIX utilities touch and mkdir.) For temporary files and directories in the R session directory see tempfile.

+

Files can be removed by either file.remove or unlink: the +latter can remove directory trees. +

+

For directory listings use list.files (also available as +dir) or list.dirs. These can select files using a regular +expression: to select by wildcards use Sys.glob. +

+

Many types of information on a filepath (including for example if it is +a file or directory) can be found by file.info. +

+

There are several ways to find out if a file ‘exists’ (a file can +exist on the filesystem and not be visible to the current user). +There are functions file.exists, file.access and +file_test with various versions of this test: file_test is +a version of the POSIX test command for those familiar with +shell scripting. +

+

Function file.copy is the R analogue of the POSIX command +cp. +

+

Choosing files can be done interactively by file.choose: the +Windows port has the more versatile functions choose.files and +choose.dir and there are similar functions in the tcltk +package: tk_choose.files and tk_choose.dir. +

+

Functions file.show and file.edit will display and edit +one or more files in a way appropriate to the R port, using the +facilities of a console (such as RGui on Windows or R.app on OS X) if +one is in use. +

+

There is some support for links in the filesystem: see functions +file.link and Sys.readlink. +
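A short sketch exercising several of these functions inside the session's temporary directory, so nothing permanent is touched:

> d <- tempfile("demo")             # a fresh path under the session temporary directory
> dir.create(d)
> f <- file.path(d, "note.txt")
> file.create(f)
> file.exists(f)
> list.files(d)
> file.info(f)$isdir                # FALSE: it is a file, not a directory
> unlink(d, recursive=TRUE)         # remove the whole directory tree again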

+ +
+ + + +

14.2 Filepaths

+ +

With a few exceptions, R relies on the underlying OS functions to +manipulate filepaths. Some aspects of this are allowed to depend on the +OS, and do, even down to the version of the OS. There are POSIX +standards for how OSes should interpret filepaths and many R users +assume POSIX compliance: but Windows does not claim to be compliant and +other OSes may be less than completely compliant. +

+

The following are some issues which have been encountered with filepaths. +

+
    +
  • POSIX filesystems are case-sensitive, so foo.png and +Foo.PNG are different files. However, the defaults on Windows +and OS X are to be case-insensitive, and FAT filesystems (commonly used +on removable storage) are not normally case-sensitive (and all filepaths +may be mapped to lower case). + +
  • Almost all the Windows’ OS services support the use of slash or +backslash as the filepath separator, and R converts the known +exceptions to the form required by Windows. + +
  • The behaviour of filepaths with a trailing slash is OS-dependent. Such +paths are not valid on Windows and should not be expected to work. +POSIX-2008 requires such paths to match only directories, but earlier +versions allowed them to also match files. So they are best avoided. + +
  • Multiple slashes in filepaths such as /abc//def are valid on +POSIX filesystems and treated as if there was only one slash. They are +usually accepted by Windows’ OS functions. However, leading +double slashes may have a different meaning. + +
  • Windows’ UNC filepaths (such as \\server\dir1\dir2\file and +\\?\UNC\server\dir1\dir2\file) are not supported, but they may +work in some R functions. POSIX filesystems are allowed to treat a +leading double slash specially. + +
  • Windows allows filepaths containing drives and relative to the current +directory on a drive, e.g. d:foo/bar refers to +d:/a/b/c/foo/bar if the current directory on drive +d: is /a/b/c. It is intended that these work, but the +use of absolute paths is safer. +
+ +

Functions basename and dirname select parts of a file path: the recommended way to assemble a file path from components is file.path. Function path.expand does ‘tilde expansion’, substituting values for home directories (the current user’s, and perhaps those of other users).

+

On filesystems with links, a single file can be referred to by many +filepaths. Function normalizePath will find a canonical +filepath. +
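A brief sketch of assembling and inspecting a filepath (the components are invented):

> p <- file.path("~", "data", "results.csv")   # assemble a path from components
> path.expand(p)                               # substitute the home directory for the tilde
> basename(p); dirname(p)                      # split the path up again
> normalizePath(".", mustWork=FALSE)           # a canonical form of the current directory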

+

Windows has the concepts of short (‘8.3’) and long file names: +normalizePath will return an absolute path using long file names +and shortPathName will return a version using short names. The +latter does not contain spaces and uses backslash as the separator, so +is sometimes useful for exporting names from R. +

+

File permissions are a related topic. R has support for the POSIX concepts of read/write/execute permission for owner/group/all, but this may be only partially supported on the filesystem (so, for example, on Windows only read-only files (for the account running the R session) are recognized). Access Control Lists (ACLs) are employed on several filesystems, but do not have an agreed standard and R has no facilities to control them. Use Sys.chmod to change permissions.

+
+ + + +

14.3 System commands

+ +

Functions system and system2 are used to invoke a system +command and optionally collect its output. system2 is a little +more general but its main advantage is that it is easier to write +cross-platform code using it. +

+

system behaves differently on Windows from other OSes (because +the API C call of that name does). Elsewhere it invokes a shell to run +the command: the Windows port of R has a function shell to do +that. +

+

To find out if the OS includes a command, use Sys.which, which +attempts to do this in a cross-platform way (unfortunately it is not a +standard OS service). +

+

Function shQuote will quote filepaths as needed for commands in +the current OS. +
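A sketch of the usual pattern (uname is merely an example of a command that may or may not be present):

> Sys.which("uname")                    # full path of the command, or "" if not found
> if (nzchar(Sys.which("uname")))
+     system2("uname", args="-a", stdout=TRUE)   # run it and collect its output
> shQuote("my file.txt")                # quote an awkward path for use on a command line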

+
+ +
+


+
+ +

14.4 Compression and Archives

+ +

Recent versions of R have extensive facilities to read and write compressed files, often transparently. Reading of files in R is to a very large extent done by connections, and the file function, which is used to open a connection to a file (or a URL), is able to identify the compression used from the ‘magic’ header of the file.

+

The type of compression which has been supported for longest is +gzip compression, and that remains a good general compromise. +Files compressed by the earlier Unix compress utility can also +be read, but these are becoming rare. Two other forms of compression, +those of the bzip2 and xz utilities are also +available. These generally achieve higher rates of compression +(depending on the file, much higher) at the expense of slower +decompression and much slower compression. +
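As a small sketch of the transparent handling just described (written entirely to a temporary file):

> tf <- tempfile(fileext=".gz")
> con <- gzfile(tf, "wt")          # write a gzip-compressed text file
> writeLines(month.name, con)
> close(con)
> readLines(tf)                    # the compression is detected from the file's header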

+

There is some confusion between xz and lzma +compression (see https://en.wikipedia.org/wiki/Xz and +https://en.wikipedia.org/wiki/LZMA): R can read files +compressed by most versions of either. +

+

File archives are single files which contain a collection of files, the +most common ones being ‘tarballs’ and zip files as used to distribute +R packages. R can list and unpack both (see functions untar +and unzip) and create both (for zip with the help of an +external program). +

+
+ +
+


+
+ +

Appendix A A sample session

+ +

The following session is intended to introduce to you some features of +the R environment by using them. Many features of the system will be +unfamiliar and puzzling at first, but this puzzlement will soon +disappear. +

+
+
Start R appropriately for your platform (see Invoking R).
+
+

The R program begins, with a banner. +

+

(Within R code, the prompt on the left hand side will not be shown to +avoid confusion.) +

+
+
help.start()
+

Start the HTML interface to on-line help (using a web browser +available at your machine). You should briefly explore the features of +this facility with the mouse. +

+

Iconify the help window and move on to the next part. +

+
+
x <- rnorm(50)
+
y <- rnorm(x)
+

Generate two pseudo-random normal vectors of x- and +y-coordinates. +

+
+
plot(x, y)
+

Plot the points in the plane. A graphics window will appear automatically. +

+
+
ls()
+

See which R objects are now in the R workspace. +

+
+
rm(x, y)
+

Remove objects no longer needed. (Clean up). +

+
+
x <- 1:20
+

Make x = (1, 2, …, 20). +

+
+
w <- 1 + sqrt(x)/2
+

A ‘weight’ vector of standard deviations. +

+
+
dummy <- data.frame(x=x, y= x + rnorm(x)*w)
+
dummy
+

Make a data frame of two columns, x and y, and look +at it. +

+
+
fm <- lm(y ~ x, data=dummy)
+
summary(fm)
+

Fit a simple linear regression and look at the +analysis. With y to the left of the tilde, +we are modelling y dependent on x. +

+
+
fm1 <- lm(y ~ x, data=dummy, weight=1/w^2)
+
summary(fm1)
+

Since we know the standard deviations, we can do a weighted regression. +

+
+
attach(dummy)
+

Make the columns in the data frame visible as variables. +

+
+
lrf <- lowess(x, y)
+

Make a nonparametric local regression function. +

+
+
plot(x, y)
+

Standard point plot. +

+
+
lines(x, lrf$y)
+

Add in the local regression. +

+
+
abline(0, 1, lty=3)
+

The true regression line: (intercept 0, slope 1). +

+
+
abline(coef(fm))
+

Unweighted regression line. +

+
+
abline(coef(fm1), col = "red")
+

Weighted regression line. +

+
+
detach()
+

Remove data frame from the search path. +

+
+
plot(fitted(fm), resid(fm),
+
     xlab="Fitted values",
+
     ylab="Residuals",
+
     main="Residuals vs Fitted")
+

A standard regression diagnostic plot to check for heteroscedasticity. +Can you see it? +

+
+
qqnorm(resid(fm), main="Residuals Rankit Plot")
+

A normal scores plot to check for skewness, kurtosis and outliers. (Not +very useful here.) +

+
+
rm(fm, fm1, lrf, x, dummy)
+

Clean up again. +

+
+ +

The next section will look at data from the classical experiment of +Michelson to measure the speed of light. This dataset is available in +the morley object, but we will read it to illustrate the +read.table function. +

+
+
filepath <- system.file("data", "morley.tab" , package="datasets")
+
filepath
+

Get the path to the data file. +

+
+
file.show(filepath)
+

Optional. Look at the file. +

+
+
mm <- read.table(filepath)
+
mm
+

Read in the Michelson data as a data frame, and look at it. +There are five experiments (column Expt) and each has 20 runs +(column Run) and sl is the recorded speed of light, +suitably coded. +

+
+
mm$Expt <- factor(mm$Expt)
+
mm$Run <- factor(mm$Run)
+

Change Expt and Run into factors. +

+
+
attach(mm)
+

Make the data frame visible at position 3 (the default). +

+
+
plot(Expt, Speed, main="Speed of Light Data", xlab="Experiment No.")
+

Compare the five experiments with simple boxplots. +

+
+
fm <- aov(Speed ~ Run + Expt, data=mm)
+
summary(fm)
+

Analyze as a randomized block, with ‘runs’ and ‘experiments’ as factors. +

+
+
fm0 <- update(fm, . ~ . - Run)
+
anova(fm0, fm)
+

Fit the sub-model omitting ‘runs’, and compare using a formal analysis +of variance. +

+
+
detach()
+
rm(fm, fm0)
+

Clean up before moving on. +

+
+
+ +

We now look at some more graphical features: contour and image plots. +

+
+
x <- seq(-pi, pi, len=50)
+
y <- x
+

x is a vector of 50 equally spaced values in the interval [-pi, pi]. y is the same.

+
+
f <- outer(x, y, function(x, y) cos(y)/(1 + x^2))
+

f is a square matrix, with rows and columns indexed by x +and y respectively, of values of the function +cos(y)/(1 + x^2). +

+
+
oldpar <- par(no.readonly = TRUE)
+
par(pty="s")
+

Save the plotting parameters and set the plotting region to “square”. +

+
+
contour(x, y, f)
+
contour(x, y, f, nlevels=15, add=TRUE)
+

Make a contour map of f; add in more lines for more detail. +

+
+
fa <- (f-t(f))/2
+

fa is the “asymmetric part” of f. (t() is +transpose). +

+
+
contour(x, y, fa, nlevels=15)
+

Make a contour plot, … +

+
+
par(oldpar)
+

… and restore the old graphics parameters. +

+
+
image(x, y, f)
+
image(x, y, fa)
+

Make some high density image plots, (of which you can get +hardcopies if you wish), … +

+
+
objects(); rm(x, y, f, fa)
+

… and clean up before moving on. +

+
+ +

R can do complex arithmetic, also. +

+
+
th <- seq(-pi, pi, len=100)
+
z <- exp(1i*th)
+

1i is used for the complex number i. +

+
+
par(pty="s")
+
plot(z, type="l")
+

Plotting complex arguments means plot imaginary versus real parts. This +should be a circle. +

+
+
w <- rnorm(100) + rnorm(100)*1i
+

Suppose we want to sample points within the unit circle. One method +would be to take complex numbers with standard normal real and imaginary +parts … +

+
+
w <- ifelse(Mod(w) > 1, 1/w, w)
+

… and to map any outside the circle onto their reciprocal. +

+
+
plot(w, xlim=c(-1,1), ylim=c(-1,1), pch="+",xlab="x", ylab="y")
+
lines(z)
+

All points are inside the unit circle, but the distribution is not +uniform. +

+
+
w <- sqrt(runif(100))*exp(2*pi*runif(100)*1i)
+
plot(w, xlim=c(-1,1), ylim=c(-1,1), pch="+", xlab="x", ylab="y")
+
lines(z)
+

The second method uses the uniform distribution. The points should now +look more evenly spaced over the disc. +

+
+
rm(th, w, z)
+

Clean up again. +

+
+
q()
+

Quit the R program. You will be asked if you want to save the R +workspace, and for an exploratory session like this, you probably do not +want to save it. +

+
+ +
+ +
+


+
+ +

Appendix B Invoking R

+ +

Users of R on Windows or OS X should read the OS-specific section +first, but command-line use is also supported. +

+ + + + + + + +
+ +
+


+
+ +

B.1 Invoking R from the command line

+ +

When working at a command line on UNIX or Windows, the command ‘R’ +can be used both for starting the main R program in the form +

+
+
R [options] [<infile] [>outfile],
+
+ +

or, via the R CMD interface, as a wrapper to various R tools +(e.g., for processing files in R documentation format or manipulating +add-on packages) which are not intended to be called “directly”. +

+

At the Windows command-line, Rterm.exe is preferred to +R. +

+

You need to ensure that either the environment variable TMPDIR is +unset or it points to a valid place to create temporary files and +directories. +

+

Most options control what happens at the beginning and at the end of an +R session. The startup mechanism is as follows (see also the on-line +help for topic ‘Startup’ for more information, and the section below +for some Windows-specific details). +

+
    +
  • Unless --no-environ was given, R searches for user and site +files to process for setting environment variables. The name of the +site file is the one pointed to by the environment variable +R_ENVIRON; if this is unset, R_HOME/etc/Renviron.site +is used (if it exists). The user file is the one pointed to by the +environment variable R_ENVIRON_USER if this is set; otherwise, +files .Renviron in the current or in the user’s home directory +(in that order) are searched for. These files should contain lines of +the form ‘name=value’. (See help("Startup") for +a precise description.) Variables you might want to set include +R_PAPERSIZE (the default paper size), R_PRINTCMD (the +default print command) and R_LIBS (specifies the list of R +library trees searched for add-on packages). + +
  • Then R searches for the site-wide startup profile unless the command +line option --no-site-file was given. The name of this file is +taken from the value of the R_PROFILE environment variable. If +that variable is unset, the default +R_HOME/etc/Rprofile.site is used if this exists. + +
  • Then, unless --no-init-file was given, R searches for a user +profile and sources it. The name of this file is taken from the +environment variable R_PROFILE_USER; if unset, a file called +.Rprofile in the current directory or in the user’s home +directory (in that order) is searched for. + +
  • It also loads a saved workspace from file .RData in the current +directory if there is one (unless --no-restore or +--no-restore-data was specified). + +
  • Finally, if a function .First() exists, it is executed. This +function (as well as .Last() which is executed at the end of the +R session) can be defined in the appropriate startup profiles, or +reside in .RData. +
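As a sketch (the contents are purely illustrative), a user could combine a .Renviron file of ‘name=value’ lines with a .Rprofile that defines .First() and .Last():

## in ~/.Renviron: plain name=value lines read before R starts
R_PAPERSIZE=a4
R_LIBS=~/R/library

## in ~/.Rprofile: ordinary R code sourced at startup
.First <- function() message("R session started at ", date())
.Last  <- function() message("R session finished at ", date())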
+ +

In addition, there are options for controlling the memory available to +the R process (see the on-line help for topic ‘Memory’ for more +information). Users will not normally need to use these unless they +are trying to limit the amount of memory used by R. +

+

R accepts the following command-line options. +

+
+
--help
+
-h
+

Print short help message to standard output and exit successfully. +

+
+
--version
+

Print version information to standard output and exit successfully. +

+
+
--encoding=enc
+

Specify the encoding to be assumed for input from the console or +stdin. This needs to be an encoding known to iconv: see +its help page. (--encoding enc is also accepted.) The +input is re-encoded to the locale R is running in and needs to be +representable in the latter’s encoding (so e.g. you cannot re-encode +Greek text in a French locale unless that locale uses the UTF-8 +encoding). +

+
+
RHOME
+

Print the path to the R “home directory” to standard output and +exit successfully. Apart from the front-end shell script and the man +page, R installation puts everything (executables, packages, etc.) +into this directory. +

+
+
--save
+
--no-save
+

Control whether data sets should be saved or not at the end of the R +session. If neither is given in an interactive session, the user is +asked for the desired behavior when ending the session with q(); +in non-interactive use one of these must be specified or implied by some +other option (see below). +

+
+
--no-environ
+

Do not read any user file to set environment variables. +

+
+
--no-site-file
+

Do not read the site-wide profile at startup. +

+
+
--no-init-file
+

Do not read the user’s profile at startup. +

+
+
--restore
+
--no-restore
+
--no-restore-data
+

Control whether saved images (file .RData in the directory where +R was started) should be restored at startup or not. The default is +to restore. (--no-restore implies all the specific +--no-restore-* options.) +

+
+
--no-restore-history
+

Control whether the history file (normally file .Rhistory in the +directory where R was started, but can be set by the environment +variable R_HISTFILE) should be restored at startup or not. The +default is to restore. +

+
+
--no-Rconsole
+

(Windows only) Prevent loading the Rconsole file at startup. +

+
+
--vanilla
+

Combine --no-save, --no-environ, +--no-site-file, --no-init-file and +--no-restore. Under Windows, this also includes +--no-Rconsole. +

+
+
-f file
+
--file=file
+

(not Rgui.exe) Take input from file: ‘-’ means +stdin. Implies --no-save unless --save has +been set. On a Unix-alike, shell metacharacters should be avoided in +file (but spaces are allowed). +

+
+
-e expression
+

(not Rgui.exe) Use expression as an input line. One or +more -e options can be used, but not together with -f +or --file. Implies --no-save unless --save +has been set. (There is a limit of 10,000 bytes on the total length of +expressions used in this way. Expressions containing spaces or shell +metacharacters will need to be quoted.) +

+
+
--no-readline
+

(UNIX only) Turn off command-line editing via readline. This +is useful when running R from within Emacs using the ESS +(“Emacs Speaks Statistics”) package. See The command-line editor, +for more information. Command-line editing is enabled for default +interactive use (see --interactive). This option also affects +tilde-expansion: see the help for path.expand. +

+
+
--min-vsize=N
+
--min-nsize=N
+

For expert use only: set the initial trigger sizes for garbage +collection of vector heap (in bytes) and cons cells (number) +respectively. Suffix ‘M’ specifies megabytes or millions of cells +respectively. The defaults are 6Mb and 350k respectively and can also +be set by environment variables R_NSIZE and R_VSIZE. +

+
+
--max-ppsize=N
+

Specify the maximum size of the pointer protection stack as N +locations. This defaults to 10000, but can be increased to allow +large and complicated calculations to be done. Currently the maximum +value accepted is 100000. +

+
+
--max-mem-size=N
+

(Windows only) Specify a limit for the amount of memory to be used both for R objects and working areas. This is set by default to the smaller of the amount of physical RAM in the machine and, for 32-bit R, 1.5Gb, and must be between 32Mb and the maximum allowed on that version of Windows.

+
+
--quiet
+
--silent
+
-q
+

Do not print out the initial copyright and welcome messages. +

+
+
--slave
+

Make R run as quietly as possible. This option is intended to +support programs which use R to compute results for them. It implies +--quiet and --no-save. +

+
+
--interactive
+

(UNIX only) Assert that R really is being run interactively even if +input has been redirected: use if input is from a FIFO or pipe and fed +from an interactive program. (The default is to deduce that R is +being run interactively if and only if stdin is connected to a +terminal or pty.) Using -e, -f or +--file asserts non-interactive use even if +--interactive is given. +

+

Note that this does not turn on command-line editing. +

+
+
--ess
+

(Windows only) Set Rterm up for use by R-inferior-mode in +ESS, including asserting interactive use (without the +command-line editor) and no buffering of stdout. +

+
+
--verbose
+

Print more information about progress, and in particular set R’s +option verbose to TRUE. R code uses this option to +control the printing of diagnostic messages. +

+
+
--debugger=name
+
-d name
+

(UNIX only) Run R through debugger name. For most debuggers +(the exceptions are valgrind and recent versions of +gdb), further command line options are disregarded, and should +instead be given when starting the R executable from inside the +debugger. +

+
+
--gui=type
+
-g type
+

(UNIX only) Use type as graphical user interface (note that this +also includes interactive graphics). Currently, possible values for +type are ‘X11’ (the default) and, provided that ‘Tcl/Tk’ +support is available, ‘Tk’. (For back-compatibility, ‘x11’ and +‘tk’ are accepted.) +

+
+
--arch=name
+

(UNIX only) Run the specified sub-architecture. +

+
+
--args
+

This flag does nothing except cause the rest of the command line to be +skipped: this can be useful to retrieve values from it with +commandArgs(TRUE). +

+
+ +

Note that input and output can be redirected in the usual way (using +‘<’ and ‘>’), but the line length limit of 4095 bytes still +applies. Warning and error messages are sent to the error channel +(stderr). +

+

The command R CMD allows the invocation of various tools which +are useful in conjunction with R, but not intended to be called +“directly”. The general form is +

+
+
R CMD command args
+
+ +

where command is the name of the tool and args the arguments +passed on to it. +

+

Currently, the following tools are available. +

+
+
BATCH
+

Run R in batch mode. Runs R --restore --save with possibly +further options (see ?BATCH). +

+
COMPILE
+

(UNIX only) Compile C, C++, Fortran … files for use with R. +

+
SHLIB
+

Build shared library for dynamic loading. +

+
INSTALL
+

Install add-on packages. +

+
REMOVE
+

Remove add-on packages. +

+
build
+

Build (that is, package) add-on packages. +

+
check
+

Check add-on packages. +

+
LINK
+

(UNIX only) Front-end for creating executable programs. +

+
Rprof
+

Post-process R profiling files. +

+
Rdconv
+
Rd2txt
+

Convert Rd format to various other formats, including HTML, LaTeX, plain text, and extracting the examples. Rd2txt can be used as shorthand for Rdconv -t txt.

+
Rd2pdf
+

Convert Rd format to PDF. +

+
Stangle
+

Extract S/R code from Sweave or other vignette documentation +

+
Sweave
+

Process Sweave or other vignette documentation +

+
Rdiff
+

Diff R output ignoring headers etc +

+
config
+

Obtain configuration information +

+
javareconf
+

(Unix only) Update the Java configuration variables +

+
rtags
+

(Unix only) Create Emacs-style tag files from C, R, and Rd files +

+
open
+

(Windows only) Open a file via Windows’ file associations +

+
texify
+

(Windows only) Process (La)TeX files with R’s style files +

+
+ +

Use +

+
+
R CMD command --help
+
+ +

to obtain usage information for each of the tools accessible via the +R CMD interface. +

+

In addition, you can use options --arch=, +--no-environ, --no-init-file, --no-site-file +and --vanilla between R and CMD: these +affect any R processes run by the tools. (Here --vanilla is +equivalent to --no-environ --no-site-file --no-init-file.) +However, note that R CMD does not of itself use any R +startup files (in particular, neither user nor site Renviron +files), and all of the R processes run by these tools (except +BATCH) use --no-restore. Most use --vanilla +and so invoke no R startup files: the current exceptions are +INSTALL, REMOVE, Sweave and +SHLIB (which uses --no-site-file --no-init-file). +

+
+
R CMD cmd args
+
+ +

for any other executable cmd on the path or given by an +absolute filepath: this is useful to have the same environment as R +or the specific commands run under, for example to run ldd or +pdflatex. Under Windows cmd can be an executable or a +batch file, or if it has extension .sh or .pl the +appropriate interpreter (if available) is called to run it. +

+ +
+ + + +

B.2 Invoking R under Windows

+ +

There are two ways to run R under Windows. Within a terminal window +(e.g. cmd.exe or a more capable shell), the methods described in +the previous section may be used, invoking by R.exe or more +directly by Rterm.exe. For interactive use, there is a +console-based GUI (Rgui.exe). +

+

The startup procedure under Windows is very similar to that under +UNIX, but references to the ‘home directory’ need to be clarified, as +this is not always defined on Windows. If the environment variable +R_USER is defined, that gives the home directory. Next, if the +environment variable HOME is defined, that gives the home +directory. After those two user-controllable settings, R tries to +find system defined home directories. It first tries to use the +Windows "personal" directory (typically C:\Documents and +Settings\username\My Documents in Windows XP). If that fails, and +environment variables HOMEDRIVE and HOMEPATH are defined +(and they normally are) these define the home directory. Failing all +those, the home directory is taken to be the starting directory. +

+

You need to ensure that the environment variables TMPDIR, TMP and TEMP are either unset or that one of them points to a valid place to create temporary files and directories.

+

Environment variables can be supplied as ‘name=value’ +pairs on the command line. +

+

If there is an argument ending .RData (in any case) it is +interpreted as the path to the workspace to be restored: it implies +--restore and sets the working directory to the parent of the +named file. (This mechanism is used for drag-and-drop and file +association with RGui.exe, but also works for Rterm.exe. +If the named file does not exist it sets the working directory +if the parent directory exists.) +

+

The following additional command-line options are available when +invoking RGui.exe. +

+
+
--mdi
+
--sdi
+
--no-mdi
+

Control whether Rgui will operate as an MDI program +(with multiple child windows within one main window) or an SDI application +(with multiple top-level windows for the console, graphics and pager). The +command-line setting overrides the setting in the user’s Rconsole file. +

+
+
--debug
+

Enable the “Break to debugger” menu item in Rgui, and trigger +a break to the debugger during command line processing. +

+
+ +

Under Windows with R CMD you may also specify your own +.bat, .exe, .sh or .pl file. It will be run +under the appropriate interpreter (Perl for .pl) with several +environment variables set appropriately, including R_HOME, +R_OSTYPE, PATH, BSTINPUTS and TEXINPUTS. For +example, if you already have latex.exe on your path, then +

+
+
R CMD latex.exe mydoc
+
+

will run LaTeX on mydoc.tex, with the path to R’s +share/texmf macros appended to TEXINPUTS. (Unfortunately, +this does not help with the MiKTeX build of LaTeX, but +R CMD texify mydoc will work in that case.) +

+
+ + + +

B.3 Invoking R under OS X

+ +

There are two ways to run R under OS X. Within a Terminal.app window by invoking R, the methods described in the first subsection apply. There is also a console-based GUI (R.app) that by default is installed in the Applications folder on your system. It is a standard double-clickable OS X application.

+

The startup procedure under OS X is very similar to that under UNIX, but +R.app does not make use of command-line arguments. The ‘home +directory’ is the one inside the R.framework, but the startup and +current working directory are set as the user’s home directory unless a +different startup directory is given in the Preferences window +accessible from within the GUI. +

+
+ +
+


+
+ +

B.4 Scripting with R

+ +

If you just want to run a file foo.R of R commands, the +recommended way is to use R CMD BATCH foo.R. If you want to +run this in the background or as a batch job use OS-specific facilities +to do so: for example in most shells on Unix-alike OSes R CMD +BATCH foo.R & runs a background job. +

+

You can pass parameters to scripts via additional arguments on the +command line: for example (where the exact quoting needed will depend on +the shell in use) +

+
+
R CMD BATCH "--args arg1 arg2" foo.R &
+
+ +

will pass arguments to a script which can be retrieved as a character +vector by +

+
+
args <- commandArgs(TRUE)
+
+ +

This is made simpler by the alternative front-end Rscript, +which can be invoked by +

+
+
Rscript foo.R arg1 arg2
+
+ +

and this can also be used to write executable script files like (at +least on Unix-alikes, and in some Windows shells) +

+
+
#! /path/to/Rscript
+args <- commandArgs(TRUE)
+...
+q(status=<exit status code>)
+
+ +

If this is entered into a text file runfoo and this is made +executable (by chmod 755 runfoo), it can be invoked for +different arguments by +

+
+
runfoo arg1 arg2
+
+ +

For further options see help("Rscript"). This writes R +output to stdout and stderr, and this can be redirected in +the usual way for the shell running the command. +

+

If you do not wish to hardcode the path to Rscript but have it +in your path (which is normally the case for an installed R except on +Windows, but e.g. OS X users may need to add /usr/local/bin +to their path), use +

+
+
#! /usr/bin/env Rscript
+...
+
+ +

At least in Bourne and bash shells, the #! mechanism does +not allow extra arguments like +#! /usr/bin/env Rscript --vanilla. +

+

One thing to consider is what stdin() refers to. It is +commonplace to write R scripts with segments like +

+
+
chem <- scan(n=24)
+2.90 3.10 3.40 3.40 3.70 3.70 2.80 2.50 2.40 2.40 2.70 2.20
+5.28 3.37 3.03 3.03 28.95 3.77 3.40 2.20 3.50 3.60 3.70 3.70
+
+ +

and stdin() refers to the script file to allow such traditional +usage. If you want to refer to the process’s stdin, use +"stdin" as a file connection, e.g. scan("stdin", ...). +

+

Another way to write executable script files (suggested by François +Pinard) is to use a here document like +

+
+
#!/bin/sh
+[environment variables can be set here]
+R --slave [other options] <<EOF
+
+   R program goes here...
+
+EOF
+
+ +

but here stdin() refers to the program source and +"stdin" will not be usable. +

+

Short scripts can be passed to Rscript on the command-line +via the -e flag. (Empty scripts are not accepted.) +

+

Note that on a Unix-alike the input filename (such as foo.R) +should not contain spaces nor shell metacharacters. +

+ +
+ +
+


+
+ +

Appendix C The command-line editor

+ + +

C.1 Preliminaries

+ +

When the GNU readline library is available at the +time R is configured for compilation under UNIX, an inbuilt command +line editor allowing recall, editing and re-submission of prior commands +is used. Note that other versions of readline exist and may be +used by the inbuilt command line editor: this used to happen on OS X. +

+

It can be disabled (useful for usage with ESS) using the startup option --no-readline.

+

Windows versions of R have somewhat simpler command-line editing: see +‘Console’ under the ‘Help’ menu of the GUI, and the +file README.Rterm for command-line editing under +Rterm.exe. +

+

When using R with readline capabilities, the functions +described below are available, as well as others (probably) documented +in man readline or info readline on your system. +

+

Many of these use either Control or Meta characters. Control characters, such as Control-m, are obtained by holding down the CTRL key while you press the m key, and are written as C-m below. Meta characters, such as Meta-b, are typed by holding down META and pressing b, and written as M-b in the following. If your terminal does not have a META key enabled, you can still type Meta characters using two-character sequences starting with ESC. Thus, to enter M-b, you could type ESCb. The ESC character sequences are also allowed on terminals with real Meta keys. Note that case is significant for Meta characters.

+ +

C.2 Editing actions

+ +

The R program keeps a history of the command lines you type, +including the erroneous lines, and commands in your history may be +recalled, changed if necessary, and re-submitted as new commands. In +Emacs-style command-line editing any straight typing you do while in +this editing phase causes the characters to be inserted in the command +you are editing, displacing any characters to the right of the cursor. +In vi mode character insertion mode is started by M-i or +M-a, characters are typed and insertion mode is finished by typing +a further ESC. (The default is Emacs-style, and only that is +described here: for vi mode see the readline +documentation.) +

+

Pressing the RET command at any time causes the command to be +re-submitted. +

+

Other editing actions are summarized in the following table. +

+ +

C.3 Command-line editor summary

+ + +

Command recall and vertical motion

+ +
+
C-p
+

Go to the previous command (backwards in the history). +

+
C-n
+

Go to the next command (forwards in the history). +

+
C-r text
+

Find the last command with the text string in it. +

+
+ +

On most terminals, you can also use the up and down arrow keys instead +of C-p and C-n, respectively. +

+ +

Horizontal motion of the cursor

+ +
+
C-a
+

Go to the beginning of the command. +

+
C-e
+

Go to the end of the line. +

+
M-b
+

Go back one word. +

+
M-f
+

Go forward one word. +

+
C-b
+

Go back one character. +

+
C-f
+

Go forward one character. +

+
+ +

On most terminals, you can also use the left and right arrow keys +instead of C-b and C-f, respectively. +

+ +

Editing and re-submission

+ +
+
text
+

Insert text at the cursor. +

+
C-f text
+

Append text after the cursor. +

+
DEL
+

Delete the previous character (left of the cursor). +

+
C-d
+

Delete the character under the cursor. +

+
M-d
+

Delete the rest of the word under the cursor, and “save” it. +

+
C-k
+

Delete from cursor to end of command, and “save” it. +

+
C-y
+

Insert (yank) the last “saved” text here. +

+
C-t
+

Transpose the character under the cursor with the next. +

+
M-l
+

Change the rest of the word to lower case. +

+
M-c
+

Change the rest of the word to upper case. +

+
RET
+

Re-submit the command to R. +

+
+ +

The final RET terminates the command line editing sequence. +

+

The readline key bindings can be customized in the usual way +via a ~/.inputrc file. These customizations can be +conditioned on application R, that is by including a section like +

+
+
$if R
+  "\C-xd": "q('no')\n"
+$endif
+
+ +
+ +
+

+Next: , Previous: , Up: Top   [Contents][Index]

+
+ +

Appendix D Function and variable index

+ +
Jump to:   ! +   +% +   +& +   +* +   ++ +   +- +   +. +   +/ +   +: +   +< +   += +   +> +   +? +   +^ +   +| +   +~ +   +
+A +   +B +   +C +   +D +   +E +   +F +   +G +   +H +   +I +   +J +   +K +   +L +   +M +   +N +   +O +   +P +   +Q +   +R +   +S +   +T +   +U +   +V +   +W +   +X +   +
Index Entry  Section

!
!: Logical vectors
!=: Logical vectors

%
%*%: Multiplication
%o%: The outer product of two arrays

&
&: Logical vectors
&&: Conditional execution

*
*: Vector arithmetic

+
+: Vector arithmetic

-
-: Vector arithmetic

.
.: Updating fitted models
.First: Customizing the environment
.Last: Customizing the environment

/
/: Vector arithmetic

:
:: Generating regular sequences
::: Namespaces
:::: Namespaces

<
<: Logical vectors
<<-: Scope
<=: Logical vectors

=
==: Logical vectors

>
>: Logical vectors
>=: Logical vectors

?
?: Getting help
??: Getting help

^
^: Vector arithmetic

|
|: Logical vectors
||: Conditional execution

~
~: Formulae for statistical models

A
abline: Low-level plotting commands
ace: Some non-standard models
add1: Updating fitted models
anova: Generic functions for extracting model information
anova: ANOVA tables
aov: Analysis of variance and model comparison
aperm: Generalized transpose of an array
array: The array() function
as.data.frame: Making data frames
as.vector: The concatenation function c() with arrays
attach: attach() and detach()
attr: Getting and setting attributes
attr: Getting and setting attributes
attributes: Getting and setting attributes
attributes: Getting and setting attributes
avas: Some non-standard models
axis: Low-level plotting commands

B
boxplot: One- and two-sample tests
break: Repetitive execution
bruto: Some non-standard models

C
c: Vectors and assignment
c: Character vectors
c: The concatenation function c() with arrays
c: Concatenating lists
C: Contrasts
cbind: Forming partitioned matrices
coef: Generic functions for extracting model information
coefficients: Generic functions for extracting model information
contour: Display graphics
contrasts: Contrasts
coplot: Displaying multivariate data
cos: Vector arithmetic
crossprod: Index matrices
crossprod: Multiplication
cut: Frequency tables from factors

D
data: Accessing builtin datasets
data.frame: Making data frames
density: Examining the distribution of a set of data
det: Singular value decomposition and determinants
detach: attach() and detach()
determinant: Singular value decomposition and determinants
dev.list: Multiple graphics devices
dev.next: Multiple graphics devices
dev.off: Multiple graphics devices
dev.prev: Multiple graphics devices
dev.set: Multiple graphics devices
deviance: Generic functions for extracting model information
diag: Multiplication
dim: Arrays
dotchart: Display graphics
drop1: Updating fitted models

E
ecdf: Examining the distribution of a set of data
edit: Editing data
eigen: Eigenvalues and eigenvectors
else: Conditional execution
Error: Analysis of variance and model comparison
example: Getting help
exp: Vector arithmetic

F
F: Logical vectors
factor: Factors
FALSE: Logical vectors
fivenum: Examining the distribution of a set of data
for: Repetitive execution
formula: Generic functions for extracting model information
function: Writing your own functions

G
getAnywhere: Object orientation
getS3method: Object orientation
glm: The glm() function

H
help: Getting help
help: Getting help
help.search: Getting help
help.start: Getting help
hist: Examining the distribution of a set of data
hist: Display graphics

I
identify: Interacting with graphics
if: Conditional execution
if: Conditional execution
ifelse: Conditional execution
image: Display graphics
is.na: Missing values
is.nan: Missing values

J
jpeg: Device drivers

K
ks.test: Examining the distribution of a set of data

L
legend: Low-level plotting commands
length: Vector arithmetic
length: The intrinsic attributes mode and length
levels: Factors
lines: Low-level plotting commands
list: Lists
lm: Linear models
lme: Some non-standard models
locator: Interacting with graphics
loess: Some non-standard models
loess: Some non-standard models
log: Vector arithmetic
lqs: Some non-standard models
lsfit: Least squares fitting and the QR decomposition

M
mars: Some non-standard models
max: Vector arithmetic
mean: Vector arithmetic
methods: Object orientation
min: Vector arithmetic
mode: The intrinsic attributes mode and length

N
NA: Missing values
NaN: Missing values
ncol: Matrix facilities
next: Repetitive execution
nlm: Nonlinear least squares and maximum likelihood models
nlm: Least squares
nlm: Maximum likelihood
nlme: Some non-standard models
nlminb: Nonlinear least squares and maximum likelihood models
nrow: Matrix facilities

O
optim: Nonlinear least squares and maximum likelihood models
order: Vector arithmetic
ordered: Ordered factors
ordered: Ordered factors
outer: The outer product of two arrays

P
pairs: Displaying multivariate data
par: The par() function
paste: Character vectors
pdf: Device drivers
persp: Display graphics
plot: Generic functions for extracting model information
plot: The plot() function
pmax: Vector arithmetic
pmin: Vector arithmetic
png: Device drivers
points: Low-level plotting commands
polygon: Low-level plotting commands
postscript: Device drivers
predict: Generic functions for extracting model information
print: Generic functions for extracting model information
prod: Vector arithmetic

Q
qqline: Examining the distribution of a set of data
qqline: Display graphics
qqnorm: Examining the distribution of a set of data
qqnorm: Display graphics
qqplot: Display graphics
qr: Least squares fitting and the QR decomposition
quartz: Device drivers

R
range: Vector arithmetic
rbind: Forming partitioned matrices
read.table: The read.table() function
rep: Generating regular sequences
repeat: Repetitive execution
resid: Generic functions for extracting model information
residuals: Generic functions for extracting model information
rlm: Some non-standard models
rm: Data permanency and removing objects

S
scan: The scan() function
sd: The function tapply() and ragged arrays
search: Managing the search path
seq: Generating regular sequences
shapiro.test: Examining the distribution of a set of data
sin: Vector arithmetic
sink: Executing commands from or diverting output to a file
solve: Linear equations and inversion
sort: Vector arithmetic
source: Executing commands from or diverting output to a file
split: Repetitive execution
sqrt: Vector arithmetic
stem: Examining the distribution of a set of data
step: Generic functions for extracting model information
step: Updating fitted models
sum: Vector arithmetic
summary: Examining the distribution of a set of data
summary: Generic functions for extracting model information
svd: Singular value decomposition and determinants

T
T: Logical vectors
t: Generalized transpose of an array
t.test: One- and two-sample tests
table: Index matrices
table: Frequency tables from factors
tan: Vector arithmetic
tapply: The function tapply() and ragged arrays
text: Low-level plotting commands
title: Low-level plotting commands
tree: Some non-standard models
TRUE: Logical vectors

U
unclass: The class of an object
update: Updating fitted models

V
var: Vector arithmetic
var: The function tapply() and ragged arrays
var.test: One- and two-sample tests
vcov: Generic functions for extracting model information
vector: Vectors and assignment

W
while: Repetitive execution
wilcox.test: One- and two-sample tests
windows: Device drivers

X
X11: Device drivers

+
Jump to:   ! +   +% +   +& +   +* +   ++ +   +- +   +. +   +/ +   +: +   +< +   += +   +> +   +? +   +^ +   +| +   +~ +   +
+A +   +B +   +C +   +D +   +E +   +F +   +G +   +H +   +I +   +J +   +K +   +L +   +M +   +N +   +O +   +P +   +Q +   +R +   +S +   +T +   +U +   +V +   +W +   +X +   +
+ +
+ +
+

+Next: , Previous: , Up: Top   [Contents][Index]

+
+ +

Appendix E Concept index

+ +
Jump to:   A +   +B +   +C +   +D +   +E +   +F +   +G +   +I +   +K +   +L +   +M +   +N +   +O +   +P +   +Q +   +R +   +S +   +T +   +U +   +V +   +W +   +
Index Entry  Section

A
Accessing builtin datasets: Accessing builtin datasets
Additive models: Some non-standard models
Analysis of variance: Analysis of variance and model comparison
Arithmetic functions and operators: Vector arithmetic
Arrays: Arrays
Assignment: Vectors and assignment
Attributes: Objects

B
Binary operators: Defining new binary operators
Box plots: One- and two-sample tests

C
Character vectors: Character vectors
Classes: The class of an object
Classes: Object orientation
Concatenating lists: Concatenating lists
Contrasts: Contrasts
Control statements: Control statements
CRAN: Contributed packages and CRAN
Customizing the environment: Customizing the environment

D
Data frames: Data frames
Default values: Named arguments and defaults
Density estimation: Examining the distribution of a set of data
Determinants: Singular value decomposition and determinants
Diverting input and output: Executing commands from or diverting output to a file
Dynamic graphics: Dynamic graphics

E
Eigenvalues and eigenvectors: Eigenvalues and eigenvectors
Empirical CDFs: Examining the distribution of a set of data

F
Factors: Factors
Factors: Contrasts
Families: Families
Formulae: Formulae for statistical models

G
Generalized linear models: Generalized linear models
Generalized transpose of an array: Generalized transpose of an array
Generic functions: Object orientation
Graphics device drivers: Device drivers
Graphics parameters: The par() function
Grouped expressions: Grouped expressions

I
Indexing of and by arrays: Array indexing
Indexing vectors: Index vectors

K
Kolmogorov-Smirnov test: Examining the distribution of a set of data

L
Least squares fitting: Least squares fitting and the QR decomposition
Linear equations: Linear equations and inversion
Linear models: Linear models
Lists: Lists
Local approximating regressions: Some non-standard models
Loops and conditional execution: Loops and conditional execution

M
Matrices: Arrays
Matrix multiplication: Multiplication
Maximum likelihood: Maximum likelihood
Missing values: Missing values
Mixed models: Some non-standard models

N
Named arguments: Named arguments and defaults
Namespace: Namespaces
Nonlinear least squares: Nonlinear least squares and maximum likelihood models

O
Object orientation: Object orientation
Objects: Objects
One- and two-sample tests: One- and two-sample tests
Ordered factors: Factors
Ordered factors: Contrasts
Outer products of arrays: The outer product of two arrays

P
Packages: R and statistics
Packages: Packages
Probability distributions: Probability distributions

Q
QR decomposition: Least squares fitting and the QR decomposition
Quantile-quantile plots: Examining the distribution of a set of data

R
Reading data from files: Reading data from files
Recycling rule: Vector arithmetic
Recycling rule: The recycling rule
Regular sequences: Generating regular sequences
Removing objects: Data permanency and removing objects
Robust regression: Some non-standard models

S
Scope: Scope
Search path: Managing the search path
Shapiro-Wilk test: Examining the distribution of a set of data
Singular value decomposition: Singular value decomposition and determinants
Statistical models: Statistical models in R
Student’s t test: One- and two-sample tests

T
Tabulation: Frequency tables from factors
Tree-based models: Some non-standard models

U
Updating fitted models: Updating fitted models

V
Vectors: Simple manipulations numbers and vectors

W
Wilcoxon test: One- and two-sample tests
Workspace: Data permanency and removing objects
Writing functions: Writing your own functions

+
Jump to:   A +   +B +   +C +   +D +   +E +   +F +   +G +   +I +   +K +   +L +   +M +   +N +   +O +   +P +   +Q +   +R +   +S +   +T +   +U +   +V +   +W +   +
+ +
+ +
+

+Previous: , Up: Top   [Contents][Index]

+
+ +

Appendix F References

+ +

D. M. Bates and D. G. Watts (1988), Nonlinear Regression +Analysis and Its Applications. John Wiley & Sons, New York. +

+

Richard A. Becker, John M. Chambers and Allan R. Wilks (1988), +The New S Language. Chapman & Hall, New York. +This book is often called the “Blue Book”. +

+

John M. Chambers and Trevor J. Hastie eds. (1992), +Statistical Models in S. Chapman & Hall, New York. +This is also called the “White Book”. +

+

John M. Chambers (1998) +Programming with Data. Springer, New York. +This is also called the “Green Book”. +

+

A. C. Davison and D. V. Hinkley (1997), Bootstrap Methods +and Their Applications, Cambridge University Press. +

+

Annette J. Dobson (1990), An Introduction to Generalized Linear +Models, Chapman and Hall, London. +

+

Peter McCullagh and John A. Nelder (1989), Generalized Linear +Models. Second edition, Chapman and Hall, London. +

+

John A. Rice (1995), Mathematical Statistics and Data Analysis. +Second edition. Duxbury Press, Belmont, CA. +

+

S. D. Silvey (1970), Statistical Inference. Penguin, London. +

+
+
+

Footnotes

+ +

(1)

+

ACM Software Systems award, 1998: +https://awards.acm.org/award_winners/chambers_6640862.cfm.

+

(2)

+

For portable R code (including that to +be used in R packages) only A–Za–z0–9 should be used.

+

(3)

+

not inside strings, +nor within the argument list of a function definition

+

(4)

+

some of the +consoles will not allow you to enter more, and amongst those which do +some will silently discard the excess and some will use it as the start +of the next line.

+

(5)

+

of unlimited length.

+

(6)

+

The leading “dot” in +this file name makes it invisible in normal file listings in +UNIX, and in default GUI file listings on OS X and Windows.

+

(7)

+

With other than vector types of argument, +such as list mode arguments, the action of c() is rather +different. See Concatenating lists.

+

(8)

+

Actually, it is still available as +.Last.value before any other statements are executed.

+

(9)

+

paste(..., collapse=ss) joins the +arguments into a single character string putting ss in between, e.g., +ss <- "|". There are more tools for character manipulation, see the help +for sub and substring.

+

(10)

+

numeric mode is +actually an amalgam of two distinct modes, namely integer and +double precision, as explained in the manual.

+

(11)

+

Note however that length(object) does not always +contain intrinsic useful information, e.g., when object is a +function.

+

(12)

+

In general, coercion +from numeric to character and back again will not be exactly reversible, +because of roundoff errors in the character representation.

+

(13)

+

A different style using +‘formal’ or ‘S4’ classes is provided in package methods.

+

(14)

+

Readers should note +that there are eight states and territories in Australia, namely the +Australian Capital Territory, New South Wales, the Northern Territory, +Queensland, South Australia, Tasmania, Victoria and Western Australia.

+

(15)

+

Note that tapply() also works in this case +when its second argument is not a factor, e.g., +‘tapply(incomes, state)’, and this is true for quite a few +other functions, since arguments are coerced to factors when +necessary (using as.factor()).

+

(16)

+

Note that x %*% x is ambiguous, as +it could mean either x’x or x x’, where x is the +column form. In such cases the smaller matrix seems implicitly to be +the interpretation adopted, so the scalar x’x is in this case the +result. The matrix x x’ may be calculated either by cbind(x) +%*% x or x %*% rbind(x) since the result of rbind() or +cbind() is always a matrix. However, the best way to compute +x’x or x x’ is crossprod(x) or x %o% x respectively.

+

(17)

+

Even better would be to form a matrix square +root B with A = BB’ and find the squared length +of the solution of By = x , perhaps using the Cholesky or +eigen decomposition of A.

+

(18)

+

Conversion of character columns to factors is +overridden using the stringsAsFactors argument to the +data.frame() function.

+

(19)

+

See the on-line help +for autoload for the meaning of the second term.

+

(20)

+

Under UNIX, the utilities sed or awk can be used.

+

(21)

+

to be +discussed later, or use xyplot from package lattice.

+

(22)

+

See also the methods described in Statistical models in R

+

(23)

+

In some sense this +mimics the behavior in S-PLUS since in S-PLUS this operator always +creates or assigns to a global variable.

+

(24)

+

So it is hidden under +UNIX.

+

(25)

+

Some graphics +parameters such as the size of the current device are for information +only.

+

(26)

+

2.5Gb on versions of Windows that support 3Gb per +process and have the support enabled: see the rw-FAQ Q2.9; 3.5Gb +on most 64-bit versions of Windows.

+

(27)

+

The +‘Emacs Speaks Statistics’ package; see the URL +http://ESS.R-project.org

+

(28)

+

On a PC keyboard this is usually the +Alt key, occasionally the ‘Windows’ key. On a Mac keyboard normally no +meta key is available.

+
+
+ + + + + diff --git a/R-ints.html b/R-ints.html new file mode 100644 index 0000000..2abe19c --- /dev/null +++ b/R-ints.html @@ -0,0 +1,5791 @@ + + + + + +R Internals + + + + + + + + + + + + + + + + +

R Internals

+ + + + + + + + + + + + + + + + + + + + + + + +

Table of Contents

+ +
+ + +
+ + + + + +

R Internals

+ +

This is a guide to the internal structures of R and coding standards for +the core team working on R itself. +

+

This manual is for R, version 3.2.3 (2015-12-10). +

+

Copyright © 1999–2015 R Core Team +

+
+

Permission is granted to make and distribute verbatim copies of this +manual provided the copyright notice and this permission notice are +preserved on all copies. +

+

Permission is granted to copy and distribute modified versions of this +manual under the conditions for verbatim copying, provided that the +entire resulting derived work is distributed under the terms of a +permission notice identical to this one. +

+

Permission is granted to copy and distribute translations of this manual +into another language, under the above conditions for modified versions, +except that this permission notice may be stated in a translation +approved by the R Core Team. +

+ + + + + + + + + + + + + + + + + + + + +
+ +
+

+Next: , Previous: , Up: Top   [Contents][Index]

+
+ +

1 R Internal Structures

+ +

This chapter is the beginnings of documentation about R internal +structures. It is written for the core team and others studying the +code in the src/main directory. +

+

It is a work-in-progress and should be checked against the current +version of the source code. Versions for R 2.x.y contain historical +comments about when features were introduced: this version is for the +3.x.y series. +

+ + + + + + + + + + + + + + + + + + + + +
+ + + +

1.1 SEXPs

+ + + +

What R users think of as variables or objects are +symbols which are bound to a value. The value can be thought of as +either a SEXP (a pointer), or the structure it points to, a +SEXPREC (and there are alternative forms used for vectors, namely +VECSXP pointing to VECTOR_SEXPREC structures). +So the basic building blocks of R objects are often called +nodes, meaning SEXPRECs or VECTOR_SEXPRECs. +

+

Note that the internal structure of the SEXPREC is not made +available to R Extensions: rather SEXP is an opaque pointer, +and the internals can only be accessed by the functions provided. +

+ +

Both types of node structure have as their first three fields a 32-bit sxpinfo header and then three pointers (to the attributes and the previous and next node in a doubly-linked list), and then some further fields. On a 32-bit platform a node (see footnote 1) occupies 28 bytes: on a 64-bit platform typically 56 bytes (depending on alignment constraints).

+

The first five bits of the sxpinfo header specify one of up to 32 +SEXPTYPEs. +

+ + + + + + + +
+ +
+

+Next: , Previous: , Up: SEXPs   [Contents][Index]

+
+ +

1.1.1 SEXPTYPEs

+ + +

Currently SEXPTYPEs 0:10 and 13:25 are in use. Values 11 and 12 +were used for internal factors and ordered factors and have since been +withdrawn. Note that the SEXPTYPE numbers are stored in +saved objects and that the ordering of the types is used, so the +gap cannot easily be reused. +

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
noSEXPTYPEDescription
0NILSXPNULL
1SYMSXPsymbols
2LISTSXPpairlists
3CLOSXPclosures
4ENVSXPenvironments
5PROMSXPpromises
6LANGSXPlanguage objects
7SPECIALSXPspecial functions
8BUILTINSXPbuiltin functions
9CHARSXPinternal character strings
10LGLSXPlogical vectors
13INTSXPinteger vectors
14REALSXPnumeric vectors
15CPLXSXPcomplex vectors
16STRSXPcharacter vectors
17DOTSXPdot-dot-dot object
18ANYSXPmake “any” args work
19VECSXPlist (generic vector)
20EXPRSXPexpression vector
21BCODESXPbyte code
22EXTPTRSXPexternal pointer
23WEAKREFSXPweak reference
24RAWSXPraw vector
25S4SXPS4 classes not of simple type
+
+ + +

Many of these will be familiar from R level: the atomic vector types are LGLSXP, INTSXP, REALSXP, CPLXSXP, STRSXP and RAWSXP. Lists are VECSXP and names (also known as symbols) are SYMSXP. Pairlists (LISTSXP, the name going back to the origins of R as a Scheme-like language) are rarely seen at R level, but are for example used for argument lists. Character vectors are effectively lists all of whose elements are CHARSXP, a type that is rarely visible at R level.
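The correspondence can be seen from R itself via typeof(), which reports the R-level name of an object’s SEXPTYPE; a small illustration:

typeof(TRUE)             # "logical"    (LGLSXP)
typeof(1L)               # "integer"    (INTSXP)
typeof(1.5)              # "double"     (REALSXP)
typeof("abc")            # "character"  (STRSXP)
typeof(list(1, 2))       # "list"       (VECSXP)
typeof(pairlist(a = 1))  # "pairlist"   (LISTSXP)
typeof(quote(x + y))     # "language"   (LANGSXP)
typeof(mean)             # "closure"    (CLOSXP)
typeof(sum)              # "builtin"    (BUILTINSXP)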

+ + +

Language objects (LANGSXP) are calls (including formulae and so on). Internally they are pairlists with first element a reference (see footnote 2) to the function to be called with remaining elements the actual arguments for the call (and with the tags if present giving the specified argument names). Although this is not enforced, many places in the code assume that the pairlist is of length one or more, often without checking.

+ +

Expressions are of type EXPRSXP: they are a vector of (usually +language) objects most often seen as the result of parse(). +

+ +

The functions are of types CLOSXP, SPECIALSXP and +BUILTINSXP: where SEXPTYPEs are stored in an integer +these are sometimes lumped into a pseudo-type FUNSXP with code +99. Functions defined via function are of type CLOSXP and +have formals, body and environment. +

+ +

The SEXPTYPE S4SXP is for S4 objects which do not consist +solely of a simple type such as an atomic vector or function. +

+ +
+ +
+

+Next: , Previous: , Up: SEXPs   [Contents][Index]

+
+ +

1.1.2 Rest of header

+ +

The sxpinfo header is defined as a 32-bit C structure by +

+
+
struct sxpinfo_struct {
+    SEXPTYPE type      :  5;  /* discussed above */
+    unsigned int obj   :  1;  /* is this an object with a class attribute? */
+    unsigned int named :  2;  /* used to control copying */
+    unsigned int gp    : 16;  /* general purpose, see below */
+    unsigned int mark  :  1;  /* mark object as ‘in use’ in GC */
+    unsigned int debug :  1;
+    unsigned int trace :  1;
+    unsigned int spare :  1;  /* debug once */
+    unsigned int gcgen :  1;  /* generation for GC */
+    unsigned int gccls :  3;  /* class of node for GC */
+};  /*              Tot: 32 */
+
+ + +

The debug bit is used for closures and environments. For +closures it is set by debug() and unset by undebug(), and +indicates that evaluations of the function should be run under the +browser. For environments it indicates whether the browsing is in +single-step mode. +

+ +

The trace bit is used for functions for trace() and for +other objects when tracing duplications (see tracemem). +

+ +

The spare bit is used for closures to mark them for one +time debugging. +

+ + + + +

The named field is set and accessed by the SET_NAMED and +NAMED macros, and take values 0, 1 and 2. +R has a ‘call by value’ illusion, so an assignment like +

+
b <- a
+
+ +

appears to make a copy of a and refer to it as b. +However, if neither a nor b are subsequently altered there +is no need to copy. What really happens is that a new symbol b +is bound to the same value as a and the named field on the +value object is set (in this case to 2). When an object is about +to be altered, the named field is consulted. A value of 2 +means that the object must be duplicated before being changed. (Note +that this does not say that it is necessary to duplicate, only that it +should be duplicated whether necessary or not.) A value of 0 +means that it is known that no other SEXP shares data with this +object, and so it may safely be altered. A value of 1 is used +for situations like +

+
+
dim(a) <- c(7, 2)
+
+ +

where in principle two copies of a exist for the duration of the +computation as (in principle) +

+
+
a <- `dim<-`(a, c(7, 2))
+
+ +

but for no longer, and so some primitive functions can be optimized to +avoid a copy in this case. +
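The copy-on-modify behaviour this supports can be observed from R with tracemem() (a small sketch, assuming R was built with memory profiling, as full builds normally are):

a <- runif(10)
tracemem(a)     # report duplications of the value currently bound to a
b <- a          # no copy yet: b is bound to the same node and NAMED becomes 2
b[1] <- 0       # modifying b forces a duplication, which tracemem reports
untracemem(a)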

+

The gp bits are by definition ‘general purpose’. We label these +from 0 to 15. Bits 0–5 and bits 14–15 have been used as described below +(mainly from detective work on the sources). +

+ + + +

The bits can be accessed and set by the LEVELS and +SETLEVELS macros, which names appear to date back to the internal +factor and ordered types and are now used in only a few places in the +code. The gp field is serialized/unserialized for the +SEXPTYPEs other than NILSXP, SYMSXP and +ENVSXP. +

+

Bits 14 and 15 of gp are used for ‘fancy bindings’. Bit 14 is +used to lock a binding or an environment, and bit 15 is used to indicate +an active binding. (For the definition of an ‘active binding’ see the +header comments in file src/main/envir.c.) Bit 15 is used for an +environment to indicate if it participates in the global cache. +

+ + +

The macros ARGUSED and SET_ARGUSED are used when matching +actual and formal function arguments, and take the values 0, 1 and 2. +

+ + +

The macros MISSING and SET_MISSING are used for pairlists +of arguments. Four bits are reserved, but only two are used (and +exactly what for is not explained). It seems that bit 0 is used by +matchArgs to mark missingness on the returned argument list, and +bit 1 is used to mark the use of a default value for an argument copied +to the evaluation frame of a closure. +

+ + + +

Bit 0 is used by macros DDVAL and SET_DDVAL. This +indicates that a SYMSXP is one of the symbols ..n which +are implicitly created when ... is processed, and so indicates +that it may need to be looked up in a DOTSXP. +

+ + +

Bit 0 is used for PRSEEN, a flag to indicate if a promise has +already been seen during the evaluation of the promise (and so to avoid +recursive loops). +

+

Bit 0 is used for HASHASH, on the PRINTNAME of the +TAG of the frame of an environment. (This bit is not serialized +for CHARSXP objects.) +

+

Bits 0 and 1 are used for weak references (to indicate ‘ready to +finalize’, ‘finalize on exit’). +

+

Bit 0 is used by the condition handling system (on a VECSXP) to +indicate a calling handler. +

+

Bit 4 is turned on to mark S4 objects. +

+

Bits 1, 2, 3, 5 and 6 are used for a CHARSXP to denote its +encoding. Bit 1 indicates that the CHARSXP should be treated as +a set of bytes, not necessarily representing a character in any known +encoding. Bits 2, 3 and 6 are used to indicate that it is known to be +in Latin-1, UTF-8 or ASCII respectively. +

+

Bit 5 for a CHARSXP indicates that it is hashed by its address, +that is NA_STRING or is in the CHARSXP cache (this is not +serialized). Only exceptionally is a CHARSXP not hashed, and +this should never happen in end-user code. +

+
+ +
+

+Next: , Previous: , Up: SEXPs   [Contents][Index]

+
+ +

1.1.3 The ‘data’

+ +

A SEXPREC is a C structure containing the 32-bit header as +described above, three pointers (to the attributes, previous and next +node) and the node data, a union +

+
+
union {
+    struct primsxp_struct primsxp;
+    struct symsxp_struct symsxp;
+    struct listsxp_struct listsxp;
+    struct envsxp_struct envsxp;
+    struct closxp_struct closxp;
+    struct promsxp_struct promsxp;
+} u;
+
+ +

All of these alternatives apart from the first (an int) are three +pointers, so the union occupies three words. +

+ +

The vector types are RAWSXP, CHARSXP, LGLSXP, INTSXP, REALSXP, CPLXSXP, STRSXP, VECSXP, EXPRSXP and WEAKREFSXP. Remember that such types are a VECTOR_SEXPREC, which again consists of the header and the same three pointers, but followed by two integers giving the length and ‘true length’ (see footnote 3) of the vector, and then followed by the data (aligned as required: on most 32-bit systems with a 24-byte VECTOR_SEXPREC node the data can follow immediately after the node). The data are a block of memory of the appropriate length to store ‘true length’ elements (rounded up to a multiple of 8 bytes, with the 8-byte blocks being the ‘Vcells’ referred to in the documentation for gc()).

+

The ‘data’ for the various types are given in the table below. A lot of +this is interpretation, i.e. the types are not checked. +

+
+
NILSXP
+

There is only one object of type NILSXP, R_NilValue, with +no data. +

+
+
SYMSXP
+

Pointers to three nodes, the name, value and internal, accessed by +PRINTNAME (a CHARSXP), SYMVALUE and +INTERNAL. (If the symbol’s value is a .Internal function, +the last is a pointer to the appropriate SEXPREC.) Many symbols +have SYMVALUE R_UnboundValue. +

+
+
LISTSXP
+

Pointers to the CAR, CDR (usually a LISTSXP or NULL) and +TAG (a SYMSXP or NULL). +

+
+
CLOSXP
+

Pointers to the formals (a pairlist), the body and the environment. +

+
+
ENVSXP
+

Pointers to the frame, enclosing environment and hash table (NULL or a +VECSXP). A frame is a tagged pairlist with tag the symbol and +CAR the bound value. +

+
+
PROMSXP
+

Pointers to the value, expression and environment (in which to evaluate the expression). Once a promise has been evaluated, the environment is set to NULL.

+
+
LANGSXP
+

A special type of LISTSXP used for function calls. (The CAR +references the function (perhaps via a symbol or language object), and +the CDR the argument list with tags for named arguments.) R-level +documentation references to ‘expressions’ / ‘language objects’ are +mainly LANGSXPs, but can be symbols (SYMSXPs) or +expression vectors (EXPRSXPs). +

+
+
SPECIALSXP
+
BUILTINSXP
+

An integer giving the offset into the table of +primitives/.Internals. +

+
+
CHARSXP
+

length, truelength followed by a block of bytes (allowing +for the nul terminator). +

+
+
LGLSXP
+
INTSXP
+

length, truelength followed by a block of C ints +(which are 32 bits on all R platforms). +

+
+
REALSXP
+

length, truelength followed by a block of C doubles. +

+
+
CPLXSXP
+

length, truelength followed by a block of C99 double +complexs. +

+
+
STRSXP
+

length, truelength followed by a block of pointers +(SEXPs pointing to CHARSXPs). +

+
+
DOTSXP
+

A special type of LISTSXP for the value bound to a ... +symbol: a pairlist of promises. +

+
+
ANYSXP
+

This is used as a place holder for any type: there are no actual objects +of this type. +

+
+
VECSXP
+
EXPRSXP
+

length, truelength followed by a block of pointers. These +are internally identical (and identical to STRSXP) but differ in +the interpretations placed on the elements. +

+
+
BCODESXP
+

For the ‘byte-code’ objects generated by the compiler. +

+
+
EXTPTRSXP
+

Has three pointers, to the pointer, the protection value (an R object +which if alive protects this object) and a tag (a SYMSXP?). +

+
+
WEAKREFSXP
+

A WEAKREFSXP is a special VECSXP of length 4, with +elements ‘key’, ‘value’, ‘finalizer’ and ‘next’. +The ‘key’ is NULL, an environment or an external pointer, +and the ‘finalizer’ is a function or NULL. +

+
+
RAWSXP
+

length, truelength followed by a block of bytes. +

+
+
S4SXP
+

two unused pointers and a tag. +

+
+ +
+ +
+

+Previous: , Up: SEXPs   [Contents][Index]

+
+ +

1.1.4 Allocation classes

+ + +

As we have seen, the field gccls in the header is three bits to +label up to 8 classes of nodes. Non-vector nodes are of class 0, and +‘small’ vector nodes are of classes 1 to 5, with a class for custom +allocator vector nodes 6 and ‘large’ vector nodes being of class 7. The +‘small’ vector nodes are able to store vector data of up to 8, 16, 32, +64 and 128 bytes: larger vectors are malloc-ed individually +whereas the ‘small’ nodes are allocated from pages of about 2000 +bytes. Vector nodes allocated using custom allocators (via +allocVector3) are not counted in the gc memory usage statistics +since their memory semantics is not under R’s control and may be +non-standard (e.g., memory could be partially shared across nodes). +

+ +
+ +
+

+Next: , Previous: , Up: R Internal Structures   [Contents][Index]

+
+ +

1.2 Environments and variable lookup

+ + + +

What users think of as ‘variables’ are symbols which are bound to +objects in ‘environments’. The word ‘environment’ is used ambiguously +in R to mean either the frame of an ENVSXP (a pairlist +of symbol-value pairs) or an ENVSXP, a frame plus an +enclosure. +
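Both parts can be inspected from R; for example:

e <- new.env(parent = baseenv())   # an ENVSXP whose enclosure is the base environment
assign("x", 1, envir = e)
ls(e)                              # the frame: the symbols bound in e, here "x"
environmentName(parent.env(e))     # the enclosure: "base"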

+ +

There are additional places that ‘variables’ can be looked up, called +‘user databases’ in comments in the code. These seem undocumented in +the R sources, but apparently refer to the RObjectTable package +at http://www.omegahat.org/RObjectTables/. +

+ + +

The base environment is special. There is an ENVSXP environment +with enclosure the empty environment R_EmptyEnv, but the frame of +that environment is not used. Rather its bindings are part of the +global symbol table, being those symbols in the global symbol table +whose values are not R_UnboundValue. When R is started the +internal functions are installed (by C code) in the symbol table, with +primitive functions having values and .Internal functions having +what would be their values in the field accessed by the INTERNAL +macro. Then .Platform and .Machine are computed and the +base package is loaded into the base environment followed by the system +profile. +

+

The frames of environments (and the symbol table) are normally hashed +for faster access (including insertion and deletion). +

+

By default R maintains a (hashed) global cache of ‘variables’ (that +is symbols and their bindings) which have been found, and this refers +only to environments which have been marked to participate, which +consists of the global environment (aka the user workspace), the base +environment plus environments4 which have been attached. When an environment is either +attached or detached, the names of its symbols are flushed +from the cache. The cache is used whenever searching for variables from +the global environment (possibly as part of a recursive search). +

+ + + + + + +
+ + + +

1.2.1 Search paths

+ + +

S has the notion of a ‘search path’: the lookup for a ‘variable’ +leads (possibly through a series of frames) to the ‘session frame’ the +‘working directory’ and then along the search path. The search path is +a series of databases (as returned by search()) which contain the +system functions (but not necessarily at the end of the path, as by +default the equivalent of packages are added at the end). +

+

R has a variant on the S model. There is a search path (also +returned by search()) which consists of the global environment +(aka user workspace) followed by environments which have been attached +and finally the base environment. Note that unlike S it is not +possible to attach environments before the workspace nor after the base +environment. +

+

However, the notion of variable lookup is more general in R, hence +the plural in the title of this subsection. Since environments have +enclosures, from any environment there is a search path found by looking +in the frame, then the frame of its enclosure and so on. Since loops +are not allowed, this process will eventually terminate: it can +terminate at either the base environment or the empty environment. (It +can be conceptually simpler to think of the search always terminating at +the empty environment, but with an optimization to stop at the base +environment.) So the ‘search path’ describes the chain of environments +which is traversed once the search reaches the global environment. +
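A small R-level illustration (attach() and detach() insert and remove databases between the workspace and the base environment):

search()                             # ".GlobalEnv" first, "package:base" last
e <- attach(NULL, name = "sandbox")  # attach an empty database after the workspace
assign("y", 2, envir = e)
y                                    # found by searching along the path
detach("sandbox")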

+
+ + + +

1.2.2 Namespaces

+ + +

Namespaces are environments associated with packages (and once again +the base package is special and will be considered separately). A +package pkg with a namespace defines two environments +namespace:pkg and package:pkg: it is +package:pkg that can be attached and form part of +the search path. +

+

The objects defined by the R code in the package are symbols with +bindings in the namespace:pkg environment. The +package:pkg environment is populated by selected symbols +from the namespace:pkg environment (the exports). The +enclosure of this environment is an environment populated with the +explicit imports from other namespaces, and the enclosure of +that environment is the base namespace. (So the illusion of the +imports being in the namespace environment is created via the +environment tree.) The enclosure of the base namespace is the global +environment, so the search from a package namespace goes via the +(explicit and implicit) imports to the standard ‘search path’. +
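This environment tree can be walked from R; a sketch using the stats namespace:

ns <- asNamespace("stats")
environmentName(parent.env(ns))    # "imports:stats": the explicit imports
identical(parent.env(parent.env(ns)), .BaseNamespaceEnv)  # TRUE: then the base namespace
identical(parent.env(.BaseNamespaceEnv), globalenv())     # TRUE: and then the search path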

+ + + +

The base namespace environment R_BaseNamespace is another +ENVSXP that is special-cased. It is effectively the same thing +as the base environment R_BaseEnv except that its +enclosure is the global environment rather than the empty environment: +the internal code diverts lookups in its frame to the global symbol +table. +

+
+ + + +

1.2.3 Hash table

+ +

Environments in R usually have a hash table, and nowadays that is the +default in new.env(). It is stored as a VECSXP where +length is used for the allocated size of the table and +truelength is the number of primary slots in use—the pointer to +the VECSXP is part of the header of a SEXP of type +ENVSXP, and this points to R_NilValue if the environment +is not hashed. +

+

For the pros and cons of hashing, see a basic text on Computer Science. +

+

The code to implement hashed environments is in src/main/envir.c. +Unless set otherwise (e.g. by the size argument of +new.env()) the initial table size is 29. The table will +be resized by a factor of 1.2 once the load factor (the proportion of +primary slots in use) reaches 85%. +

+

The hash chains are stored as pairlist elements of the VECSXP: +items are inserted at the front of the pairlist. Hashing is principally +designed for fast searching of environments, which are from time to time +added to but rarely deleted from, so items are not actually deleted but +have their value set to R_UnboundValue. +
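At R level the hashing is controlled through new.env(); a minimal sketch:

e <- new.env(hash = TRUE, size = 29L)         # hashed frame with the default initial table size
for (nm in letters) assign(nm, 1, envir = e)
length(ls(e))                                 # 26 bindings; the table grows as the load factor rises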

+ +
+ + + +

1.3 Attributes

+ + + + + +

As we have seen, every SEXPREC has a pointer to the attributes of +the node (default R_NilValue). The attributes can be +accessed/set by the macros/functions ATTRIB and +SET_ATTRIB, but such direct access is normally only used to check +if the attributes are NULL or to reset them. Otherwise access +goes through the functions getAttrib and setAttrib which +impose restrictions on the attributes. One thing to watch is that if +you copy attributes from one object to another you may (un)set the +"class" attribute and so need to copy the object and S4 bits as +well. There is a macro/function DUPLICATE_ATTRIB to automate +this. +

+

Note that the ‘attributes’ of a CHARSXP are used as part of the +management of the CHARSXP cache: of course CHARSXP’s are +not user-visible but C-level code might look at their attributes. +

+

The code assumes that the attributes of a node are either +R_NilValue or a pairlist of non-zero length (and this is checked +by SET_ATTRIB). The attributes are named (via tags on the +pairlist). The replacement function attributes<- ensures that +"dim" precedes "dimnames" in the pairlist. Attribute +"dim" is one of several that is treated specially: the values are +checked, and any "names" and "dimnames" attributes are +removed. Similarly, you cannot set "dimnames" without having set +"dim", and the value assigned must be a list of the correct +length and with elements of the correct lengths (and all zero-length +elements are replaced by NULL). +
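The special handling of "dim" is easily seen from R:

x <- 1:6
names(x) <- letters[1:6]
dim(x) <- c(2, 3)                          # values are checked and "names" is removed
attributes(x)                              # only $dim remains
dimnames(x) <- list(c("r1", "r2"), NULL)   # only permitted once "dim" has been set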

+

The other attributes which are given special treatment are +"names", "class", "tsp", "comment" and +"row.names". For pairlist-like objects the names are not stored +as an attribute but (as symbols) as the tags: however the R interface +makes them look like conventional attributes, and for one-dimensional +arrays they are stored as the first element of the "dimnames" +attribute. The C code ensures that the "tsp" attribute is an +REALSXP, the frequency is positive and the implied length agrees +with the number of rows of the object being assigned to. Classes and +comments are restricted to character vectors, and assigning a +zero-length comment or class removes the attribute. Setting or removing +a "class" attribute sets the object bit appropriately. Integer +row names are converted to and from the internal compact representation. +

+ +

Care needs to be taken when adding attributes to objects of the types +with non-standard copying semantics. There is only one object of type +NILSXP, R_NilValue, and that should never have attributes +(and this is enforced in installAttrib). For environments, +external pointers and weak references, the attributes should be relevant +to all uses of the object: it is for example reasonable to have a name +for an environment, and also a "path" attribute for those +environments populated from R code in a package. +

+ + +

When should attributes be preserved under operations on an object? +Becker, Chambers & Wilks (1988, pp. 144–6) give some guidance. Scalar +functions (those which operate element-by-element on a vector and whose +output is similar to the input) should preserve attributes (except +perhaps class, and if they do preserve class they need to preserve the +OBJECT and S4 bits). Binary operations normally call + +copyMostAttributes to copy most attributes from the longer +argument (and if they are of the same length from both, preferring the +values on the first). Here ‘most’ means all except the names, +dim and dimnames which are set appropriately by the code +for the operator. +

+

Subsetting (other than by an empty index) generally drops all attributes +except names, dim and dimnames which are reset as +appropriate. On the other hand, subassignment generally preserves such +attributes even if the length is changed. Coercion drops all +attributes. For example: +

+
+
> x <- structure(1:8, names=letters[1:8], comm="a comment")
+> x[]
+a b c d e f g h
+1 2 3 4 5 6 7 8
+attr(,"comm")
+[1] "a comment"
+> x[1:3]
+a b c
+1 2 3
+> x[3] <- 3
+> x
+a b c d e f g h
+1 2 3 4 5 6 7 8
+attr(,"comm")
+[1] "a comment"
+> x[9] <- 9
+> x
+a b c d e f g h
+1 2 3 4 5 6 7 8 9
+attr(,"comm")
+[1] "a comment"
+
+ + +
+ + + +

1.4 Contexts

+ + +

Contexts are the internal mechanism used to keep track of where a +computation has got to (and from where), so that control-flow constructs +can work and reasonable information can be produced on error conditions +(such as via traceback), and otherwise (the sys.xxx +functions). +

+

Execution contexts are a stack of C structs: +

+
+
typedef struct RCNTXT {
+    struct RCNTXT *nextcontext; /* The next context up the chain */
+    int callflag;               /* The context ‘type’ */
+    JMP_BUF cjmpbuf;            /* C stack and register information */
+    int cstacktop;              /* Top of the pointer protection stack */
+    int evaldepth;              /* Evaluation depth at inception */
+    SEXP promargs;              /* Promises supplied to closure */
+    SEXP callfun;               /* The closure called */
+    SEXP sysparent;             /* Environment the closure was called from */
+    SEXP call;                  /* The call that effected this context */
+    SEXP cloenv;                /* The environment */
+    SEXP conexit;               /* Interpreted on.exit code */
+    void (*cend)(void *);       /* C on.exit thunk */
+    void *cenddata;             /* Data for C on.exit thunk */
+    char *vmax;                 /* Top of the R_alloc stack */
+    int intsusp;                /* Interrupts are suspended */
+    SEXP handlerstack;          /* Condition handler stack */
+    SEXP restartstack;          /* Stack of available restarts */
+    struct RPRSTACK *prstack;   /* Stack of pending promises */
+} RCNTXT, *context;
+
+ +

plus additional fields for the byte-code compiler. The ‘types’ +are from +

+
+
enum {
+    CTXT_TOPLEVEL = 0,  /* toplevel context */
+    CTXT_NEXT     = 1,  /* target for next */
+    CTXT_BREAK    = 2,  /* target for break */
+    CTXT_LOOP     = 3,  /* break or next target */
+    CTXT_FUNCTION = 4,  /* function closure */
+    CTXT_CCODE    = 8,  /* other functions that need error cleanup */
+    CTXT_RETURN   = 12, /* return() from a closure */
+    CTXT_BROWSER  = 16, /* return target on exit from browser */
+    CTXT_GENERIC  = 20, /* rather, running an S3 method */
+    CTXT_RESTART  = 32, /* a call to restart was made from a closure */
+    CTXT_BUILTIN  = 64  /* builtin internal function */
+};
+
+ +

where the CTXT_FUNCTION bit is on wherever function closures are +involved. +

+

Contexts are created by a call to begincontext and ended by a +call to endcontext: code can search up the stack for a +particular type of context via findcontext (and jump there) or +jump to a specific context via R_JumpToContext. +R_ToplevelContext is the ‘idle’ state (normally the command +prompt), and R_GlobalContext is the top of the stack. +

+

Note that whilst calls to closures and builtins set a context, those to special +internal functions never do. +

+ + +

Dispatching from a S3 generic (via UseMethod or its internal +equivalent) or calling NextMethod sets the context type to +CTXT_GENERIC. This is used to set the sysparent of the +method call to that of the generic, so the method appears to have +been called in place of the generic rather than from the generic. +

+

The R sys.frame and sys.call functions work by counting +calls to closures (type CTXT_FUNCTION) from either end of the +context stack. +
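For example, the closure contexts on the stack are what the following returns:

f <- function() g()
g <- function() sys.calls()   # walks the CTXT_FUNCTION contexts on the context stack
f()                           # a list of the two pending calls, f() and g()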

+

Note that the sysparent element of the structure is not the same +thing as sys.parent(). Element sysparent is primarily +used in managing changes of the function being evaluated, i.e. by +Recall and method dispatch. +

+

CTXT_CCODE contexts are currently used in cat(), +load(), scan() and write.table() (to close the +connection on error), by PROTECT, serialization (to recover from +errors, e.g. free buffers) and within the error handling code (to +raise the C stack limit and reset some variables). +

+ +
+ +
+

+Next: , Previous: , Up: R Internal Structures   [Contents][Index]

+
+ +

1.5 Argument evaluation

+ + +

As we have seen, functions in R come in three types, closures +(SEXPTYPE CLOSXP), specials (SPECIALSXP) and +builtins (BUILTINSXP). In this section we consider when (and if) +the actual arguments of function calls are evaluated. The rules are +different for the internal (special/builtin) and R-level functions +(closures). +

+

For a call to a closure, the actual and formal arguments are matched and +a matched call (another LANGSXP) is constructed. This process +first replaces the actual argument list by a list of promises to the +values supplied. It then constructs a new environment which contains +the names of the formal parameters matched to actual or default values: +all the matched values are promises, the defaults as promises to be +evaluated in the environment just created. That environment is then +used for the evaluation of the body of the function, and promises will +be forced (and hence actual or default arguments evaluated) when they +are encountered. + +(Evaluating a promise sets NAMED = 2 on its value, so if the +argument was a symbol its binding is regarded as having multiple +references during the evaluation of the closure call.) +
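Because the matched arguments are promises, an argument that is never used is never evaluated; a minimal illustration:

f <- function(x, y) x              # y is matched to a promise but never forced
f(1, stop("never evaluated"))      # returns 1: the promise for y is not forced
g <- function(x) { force(x); x }   # force() simply evaluates the promise early
g(2)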

+

If the closure is an S3 generic (that is, contains a call to +UseMethod) the evaluation process is the same until the +UseMethod call is encountered. At that point the argument on +which to do dispatch (normally the first) will be evaluated if it has +not been already. If a method has been found which is a closure, a new +evaluation environment is created for it containing the matched +arguments of the method plus any new variables defined so far during the +evaluation of the body of the generic. (Note that this means changes to +the values of the formal arguments in the body of the generic are +discarded when calling the method, but actual argument promises +which have been forced retain the values found when they were forced. +On the other hand, missing arguments have values which are promises to +use the default supplied by the method and not by the generic.) If the +method found is a primitive it is called with the matched argument list +of promises (possibly already forced) used for the generic. +

+ + + + +

The essential difference (see footnote 5) between special and builtin functions is that the arguments of specials are not evaluated before the C code is called, and those of builtins are. Note that being a special/builtin is separate from being primitive or .Internal: quote is a special primitive, + is a builtin primitive, cbind is a special .Internal and grep is a builtin .Internal.
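The distinction is visible from typeof() applied to the function objects themselves:

typeof(`+`)      # "builtin": arguments are evaluated before the C code is entered
typeof(sum)      # "builtin"
typeof(`quote`)  # "special": arguments are passed unevaluated
typeof(`if`)     # "special"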

+ + +

Many of the internal functions are internal generics, which for specials +means that they do not evaluate their arguments on call, but the C code +starts with a call to DispatchOrEval. The latter evaluates the +first argument, and looks for a method based on its class. (If S4 +dispatch is on, S4 methods are looked for first, even for S3 classes.) +If it finds a method, it dispatches to that method with a call based on +promises to evaluate the remaining arguments. If no method is found, +the remaining arguments are evaluated before return to the internal +generic. +

+ + +

The other way that internal functions can be generic is to be group +generic. Most such functions are builtins (so immediately evaluate all +their arguments), and all contain a call to the C function +DispatchGeneric. There are some peculiarities over the number of +arguments for the "Math" group generic, with some members +allowing only one argument, some having two (with a default for the +second) and trunc allows one or more but the default method only +accepts one. +

+ + + + + +
+ + + +

1.5.1 Missingness

+ + +

Actual arguments to (non-internal) R functions can be fewer than are +required to match the formal arguments of the function. Having +unmatched formal arguments will not matter if the argument is never used +(by lazy evaluation), but when the argument is evaluated, either its +default value is evaluated (within the evaluation environment of the +function) or an error is thrown with a message along the lines of +

+
+
argument "foobar" is missing, with no default
+
+ + + +

Internally missingness is handled by two mechanisms. The object +R_MissingArg is used to indicate that a formal argument has no +(default) value. When matching the actual arguments to the formal +arguments, a new argument list is constructed from the formals all of +whose values are R_MissingArg with the first MISSING bit +set. Then whenever a formal argument is matched to an actual argument, +the corresponding member of the new argument list has its value set to +that of the matched actual argument, and if that is not +R_MissingArg the missing bit is unset. +

+

This new argument list is used to form the evaluation frame for the +function, and if named arguments are subsequently given a new value +(before they are evaluated) the missing bit is cleared. +

+

Missingness of arguments can be interrogated via the missing() +function. An argument is clearly missing if its missing bit is set or +if the value is R_MissingArg. However, missingness can be passed +on from function to function, for using a formal argument as an actual +argument in a function call does not count as evaluation. So +missing() has to examine the value (a promise) of a +non-yet-evaluated formal argument to see if it might be missing, which +might involve investigating a promise and so on …. +
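A short example, including missingness being passed on through an unevaluated argument:

f <- function(x, y) if (missing(y)) "y is missing" else y
f(1)                          # "y is missing"
f(1, 2)                       # 2
g <- function(a, b) f(a, b)   # b is passed on unevaluated
g(1)                          # "y is missing": missingness propagates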

+

Special primitives also need to handle missing arguments, and in some +case (e.g. log) that is why they are special and not +builtin. This is usually done by testing if an argument’s value is +R_MissingArg. +

+
+ +
+

+Previous: , Up: Argument evaluation   [Contents][Index]

+
+ +

1.5.2 Dot-dot-dot arguments

+ + +

Dot-dot-dot arguments are convenient when writing functions, but +complicate the internal code for argument evaluation. +

+

The formals of a function with a ... argument represent that as a +single argument like any other argument, with tag the symbol +R_DotsSymbol. When the actual arguments are matched to the +formals, the value of the ... argument is of SEXPTYPE +DOTSXP, a pairlist of promises (as used for matched arguments) +but distinguished by the SEXPTYPE. +

+

Recall that the evaluation frame for a function initially contains the +name=value pairs from the matched call, and hence +this will be true for ... as well. The value of ... is a +(special) pairlist whose elements are referred to by the special symbols +..1, ..2, … which have the DDVAL bit set: +when one of these is encountered it is looked up (via ddfindVar) +in the value of the ... symbol in the evaluation frame. +
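A short illustration of the ..n lookup and of forcing the promises held in ...:

f <- function(...) ..2                # ..2 is looked up in the DOTSXP bound to ...
f("a", "b", "c")                      # "b"
g <- function(...) length(list(...))  # list(...) forces each promise in turn
g(1, 2, 3)                            # 3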

+

Values of arguments matched to a ... argument can be missing. +

+

Special primitives may need to handle ... arguments: see for +example the internal code of switch in file +src/main/builtin.c. +

+
+ + + +

1.6 Autoprinting

+ + + + +

Whether the returned value of a top-level R expression is printed is +controlled by the global boolean variable R_Visible. This is set +(to true or false) on entry to all primitive and internal functions +based on the eval column of the table in file +src/main/names.c: the appropriate setting can be extracted by the +macro PRIMPRINT. + +

+ +

The R primitive function invisible makes use of this +mechanism: it just sets R_Visible = FALSE before entry and +returns its argument. +
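
The effect is visible from R via withVisible (a sketch):

    withVisible(1 + 1)           # value 2, visible TRUE
    withVisible(invisible(2))    # value 2, visible FALSE
    withVisible(x <- 2)          # value 2, visible FALSE: assignments return invisibly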

+

For most functions the intention will be that the setting of R_Visible when they are entered is the setting used when they return, but there need to be exceptions. The R functions identify, options, system and writeBin determine whether the result should be visible from the arguments or user action. Other functions themselves dispatch functions which may change the visibility flag: examples are .Internal, do.call, eval, withVisible, if, NextMethod, Recall, recordGraphics, standardGeneric, switch and UseMethod.

+

‘Special’ primitive and internal functions evaluate their arguments internally after R_Visible has been set, and evaluation of the arguments (e.g. an assignment, as in PR#9263) can change the value of the flag.

+

The R_Visible flag can also get altered during the evaluation of +a function, with comments in the code about warning, +writeChar and graphics functions calling GText (PR#7397). +(Since the C-level function eval sets R_Visible, this +could apply to any function calling it. Since it is called when +evaluating promises, even object lookup can change R_Visible.) +Internal and primitive functions force the documented setting of +R_Visible on return, unless the C code is allowed to change it +(the exceptions above are indicated by PRIMPRINT having value 2). +

+

The actual autoprinting is done by PrintValueEnv in file +print.c. If the object to be printed has the S4 bit set and S4 +methods dispatch is on, show is called to print the object. +Otherwise, if the object bit is set (so the object has a +"class" attribute), print is called to dispatch methods: +for objects without a class the internal code of print.default +is called. +

+ +
+ + + +

1.7 The write barrier and the garbage collector

+ + + +

R has long had a generational garbage collector, and bit gcgen +in the sxpinfo header is used in the implementation of this. +This is used in conjunction with the mark bit to identify two +previous generations. +

+

There are three levels of collections. Level 0 collects only the +youngest generation, level 1 collects the two youngest generations and +level 2 collects all generations. After 20 level-0 collections the next +collection is at level 1, and after 5 level-1 collections at level 2. +Further, if a level-n collection fails to provide 20% free space +(for each of nodes and the vector heap), the next collection will be at +level n+1. (The R-level function gc() performs a +level-2 collection.) +

+

A generational collector needs to efficiently ‘age’ the objects, +especially list-like objects (including STRSXPs). This is done +by ensuring that the elements of a list are regarded as at least as old +as the list when they are assigned. This is handled by the +functions SET_VECTOR_ELT and SET_STRING_ELT, which is why +they are functions and not macros. Ensuring the integrity of such +operations is termed the write barrier and is done by making the +SEXP opaque and only providing access via functions (which cannot +be used as lvalues in assignments in C). +

+

All code in R extensions is by default behind the write barrier. The +only way to obtain direct access to the internals of the SEXPRECs +is to define ‘USE_RINTERNALS’ before including header file +Rinternals.h, which is normally defined in Defn.h. To +enable a check on the way that the access is used, R can be compiled +with flag --enable-strict-barrier which ensures that header +Defn.h does not define ‘USE_RINTERNALS’ and hence that +SEXP is opaque in most of R itself. (There are some necessary +exceptions: foremost in file memory.c where the accessor +functions are defined and also in file size.c which needs access +to the sizes of the internal structures.) +

+

For background papers see +http://homepage.stat.uiowa.edu/~luke/R/barrier.html and +http://homepage.stat.uiowa.edu/~luke/R/gengcnotes.html. +

+
+ + + +

1.8 Serialization Formats

+ + +

Serialized versions of R objects are used by load/save +and also at a slightly lower level by saveRDS/readRDS (and +their earlier ‘internal’ dot-name versions) and +serialize/unserialize. These differ in what they +serialize to (a file, a connection, a raw vector) and whether they are +intended to serialize a single object or a collection of objects +(typically the workspace). save writes a header at the beginning +of the file (a single LF-terminated line) which the lower-level versions +do not. +

+

save and saveRDS allow various forms of compression, and +gzip compression is the default (except for ASCII +saves). Compression is applied to the whole file stream, including the +headers, so serialized files can be uncompressed or re-compressed by +external programs. Both load and readRDS can read +gzip, bzip2 and xz forms of compression +when reading from a file, and gzip compression when reading +from a connection. +

+

R has used the same serialization format since R 1.4.0 in December +2001. Earlier formats are still supported via load and +save but such formats are not described here. The current +serialization format is called ‘version 2’, and has been expanded in +back-compatible ways since its inception, for example to support +additional SEXPTYPEs. +

+

save works by writing a single-line header (typically +RDX2\n for a binary save: the only other current value is +RDA2\n for save(files=TRUE)), then creating a tagged +pairlist of the objects to be saved and serializing that single object. +load reads the header line, unserializes a single object (a +pairlist or a vector list) and assigns the elements of the object in the +specified environment. The header line serves two purposes in R: it +identifies the serialization format so load can switch to the +appropriate reader code, and the linefeed allows the detection of files +which have been subjected to a non-binary transfer which re-mapped line +endings. It can also be thought of as a ‘magic number’ in the sense +used by the file program (although R save files are not yet +by default known to that program). +

+

Serialization in R needs to take into account that objects may +contain references to environments, which then have enclosing +environments and so on. (Environments recognized as package or name +space environments are saved by name.) There are ‘reference objects’ +which are not duplicated on copy and should remain shared on +unserialization. These are weak references, external pointers and +environments other than those associated with packages, namespaces and +the global environment. These are handled via a hash table, and +references after the first are written out as a reference marker indexed +by the table entry. +

+

Version-2 serialization first writes a header indicating the format +(normally ‘X\n’ for an XDR format binary save, but ‘A\n’, +ASCII, and ‘B\n’, native word-order binary, can also occur) and +then three integers giving the version of the format and two R +versions (packed by the R_Version macro from Rversion.h). +(Unserialization interprets the two versions as the version of R +which wrote the file followed by the minimal version of R needed to +read the format.) Serialization then writes out the object recursively +using function WriteItem in file src/main/serialize.c. +
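
The header can be inspected from R; a minimal sketch (the byte values in the comments are what an XDR binary save in the version-2 format described here would produce):

    r <- serialize(1:3, connection = NULL)   # XDR binary serialization to a raw vector
    rawToChar(r[1:2])                        # "X\n": the format marker
    r[3:6]                                   # the format version, as a big-endian integer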

+

Some objects are written as if they were SEXPTYPEs: such +pseudo-SEXPTYPEs cover R_NilValue, R_EmptyEnv, +R_BaseEnv, R_GlobalEnv, R_UnboundValue, +R_MissingArg and R_BaseNamespace. +

+

For all SEXPTYPEs except NILSXP, SYMSXP and ENVSXP, serialization starts with an integer whose bits 0:7 hold the SEXPTYPE, followed by the object bit, two bits indicating if there are any attributes and if there is a tag (for the pairlist types), an unused bit and then the gp field in bits 12:27. Pairlist-like objects write their attributes (if any), tag (if any), CAR and then CDR (using tail recursion): other objects write their attributes after themselves. Atomic vector objects write their length followed by the data: generic vector-list objects write their length followed by a call to WriteItem for each element. The code for CHARSXPs special-cases NA_STRING and writes it as length -1 with no data. Lengths up to 2^31 - 1 are written in that way and larger lengths (which only occur on 64-bit systems) as -1 followed by the upper and lower 32 bits as integers (regarded as unsigned).

+

Environments are treated in several ways: as we have seen, some are +written as specific pseudo-SEXPTYPEs. Package and namespace +environments are written with pseudo-SEXPTYPEs followed by the +name. ‘Normal’ environments are written out as ENVSXPs with an +integer indicating if the environment is locked followed by the +enclosure, frame, ‘tag’ (the hash table) and attributes. +

+

In the ‘XDR’ format integers and doubles are written in bigendian order: +however the format is not fully XDR (as defined in RFC 1832) as byte +quantities (such as the contents of CHARSXP and RAWSXP +types) are written as-is and not padded to a multiple of four bytes. +

+

The ‘ASCII’ format writes 7-bit characters. Integers are formatted with +%d (except that NA_integer_ is written as NA), +doubles formatted with %.16g (plus NA, Inf and +-Inf) and bytes with %02x. Strings are written using +standard escapes (e.g. \t and \013) for non-printing and +non-ASCII bytes. +

+
+ + + +

1.9 Encodings for CHARSXPs

+ +

Character data in R are stored in the sexptype CHARSXP. +

+

There is support for encodings other than that of the current locale, in +particular UTF-8 and the multi-byte encodings used on Windows for CJK +languages. A limited means to indicate the encoding of a CHARSXP +is via two of the ‘general purpose’ bits which are used to declare +the encoding to be either Latin-1 or UTF-8. (Note that it is possible +for a character vector to contain elements in different encodings.) +Both printing and plotting notice the declaration and convert the string +to the current locale (possibly using <xx> to display in +hexadecimal bytes that are not valid in the current locale). Many (but +not all) of the character manipulation functions will either preserve +the declaration or re-encode the character string. +

+

Strings that refer to the OS such as file names need to be passed +through a wide-character interface on some OSes (e.g. Windows). +

+

When are character strings declared to be of known encoding? One way is +to do so directly via Encoding. The parser declares the encoding +if this is known, either via the encoding argument to +parse or from the locale within which parsing is being done at +the R command line. (Other ways are recorded on the help page for +Encoding.) +
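
A sketch at the R level (the byte \xe9 is 'é' in Latin-1; the encoding initially reported depends on the locale in which the string is entered):

    x <- "caf\xe9"
    Encoding(x)                # typically "unknown" until declared
    Encoding(x) <- "latin1"
    y <- enc2utf8(x)
    Encoding(y)                # "UTF-8"
    charToRaw(y)               # the final byte pair c3 a9 is the UTF-8 form of 0xe9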

+

It is not necessary to declare the encoding of ASCII strings +as they will work in any locale. ASCII strings should never +have a marked encoding, as any encoding will be ignored when entering +such strings into the CHARSXP cache. +

+

The rationale behind considering only UTF-8 and Latin-1 was that most systems are capable of producing UTF-8 strings and this is the nearest we have to a universal format. For those that do not (for example those lacking a powerful enough iconv), it is likely that they work in Latin-1, the old R assumption. The parser can return a UTF-8-encoded string if it encounters a ‘\uxxxx’ escape for a Unicode point that cannot be represented in the current charset. (This needs MBCS support, and was originally only enabled on Windows.) This is now enabled for all platforms, and a ‘\uxxxx’ or ‘\Uxxxxxxxx’ escape ensures that the parsed string will be marked as UTF-8.

+

Most of the character manipulation functions now preserve UTF-8 +encodings: there are some notes as to which at the top of file +src/main/character.c and in file +src/library/base/man/Encoding.Rd. +

+

Graphics devices are offered the possibility of being handed UTF-8-encoded strings without re-encoding to the native character set, by setting hasTextUTF8 to be ‘TRUE’ and supplying functions textUTF8 and strWidthUTF8 that expect UTF-8-encoded inputs. Normally the symbol font is encoded in Adobe Symbol encoding, but it can be re-encoded to UTF-8 by setting wantSymbolUTF8 to ‘TRUE’. The Windows port of cairographics has a rather peculiar assumption: it wants the symbol font to be encoded in UTF-8 as if it were Latin-1 rather than Adobe Symbol: this is selected by wantSymbolUTF8 = NA_LOGICAL.

+

Windows has no UTF-8 locales, but rather expects to work with UCS-2 strings. R (being written in standard C) would not work internally with UCS-2 without extensive changes. The Rgui console uses UCS-2 internally, but communicates with the R engine in the native encoding. To allow UTF-8 strings to be printed in UTF-8 in Rgui.exe, an escape convention is used (see header file rgui_UTF8.h) by cat, print and autoprinting.

+

‘Unicode’ (UCS-2LE) files are common in the Windows world, and +readLines and scan will read them into UTF-8 strings on +Windows if the encoding is declared explicitly on an unopened +connection passed to those functions. +

+
+ + + +

1.10 The CHARSXP cache

+ + +

There is a global cache for CHARSXPs created by mkChar — +the cache ensures that most CHARSXPs with the same contents share +storage (‘contents’ including any declared encoding). Not all +CHARSXPs are part of the cache – notably ‘NA_STRING’ is +not. CHARSXPs reloaded from the save formats of R prior +to 0.99.0 are not cached (since the code used is frozen and very few +examples still exist). +

+ +

The cache records the encoding of the string as well as the bytes: all +requests to create a CHARSXP should be via a call to +mkCharLenCE. Any encoding given in mkCharLenCE call will +be ignored if the string’s bytes are all ASCII characters. +

+ +
+ + + +

1.11 Warnings and errors

+ + + + + + +

Each of warning and stop have two C-level equivalents, +warning, warningcall, error and errorcall. +The relationship between the pairs is similar: warning tries to +fathom out a suitable call, and then calls warningcall with that +call as the first argument if it succeeds, and with call = +R_NilValue if it does not. When warningcall is called, it +includes the deparsed call in its printout unless call = +R_NilValue. +

+

warning and error look at the context stack. If the +topmost context is not of type CTXT_BUILTIN, it is used to +provide the call, otherwise the next context provides the call. +This means that when these functions are called from a primitive or +.Internal, the imputed call will not be to +primitive/.Internal but to the function calling the +primitive/.Internal . This is exactly what one wants for a +.Internal, as this will give the call to the closure wrapper. +(Further, for a .Internal, the call is the argument to +.Internal, and so may not correspond to any R function.) +However, it is unlikely to be what is needed for a primitive. +

+

The upshot is that warningcall and errorcall should normally be used for code called from a primitive, and warning and error should be used for code called from a .Internal (and necessarily from .Call, .C and so on, where the call is not passed down). However, there are two complications. One is that code might be called from either a primitive or a .Internal, in which case probably warningcall is more appropriate. The other involves replacement functions, where the call was once of the form

+
> length(x) <- y ~ x
+Error in "length<-"(`*tmp*`, value = y ~ x) : invalid value
+
+ +

which is unpalatable to the end user. For replacement functions there +will be a suitable context at the top of the stack, so warning +should be used. (The results for .Internal replacement functions +such as substr<- are not ideal.) +

+ + +
+ + + +

1.12 S4 objects

+ +

[This section is currently a preliminary draft and should not be taken +as definitive. The description assumes that R_NO_METHODS_TABLES +has not been set.] +

+ + + + + + + +
+ +
+


1.12.1 Representation of S4 objects

+ +

S4 objects can be of any SEXPTYPE. They are either an object of +a simple type (such as an atomic vector or function) with S4 class +information or of type S4SXP. In all cases, the ‘S4 bit’ (bit 4 +of the ‘general purpose’ field) is set, and can be tested by the +macro/function IS_S4_OBJECT. +

+

S4 objects are created via new() and thence via the C function R_do_new_object. This duplicates the prototype of the class, adds a class attribute and sets the S4 bit. All S4 class attributes should be character vectors of length one with an attribute giving (as a character string) the name of the package (or .GlobalEnv) containing the class definition. Since S4 objects have a class attribute, the OBJECT bit is set.

+

It is currently unclear what should happen if the class attribute is +removed from an S4 object, or if this should be allowed. +

+
+ + + +

1.12.2 S4 classes

+ +

S4 classes are stored as R objects in the environment in which they +are created, with names .__C__classname: as such they are +not listed by default by ls. +
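
For example (a sketch with a made-up class defined in the global environment):

    library(methods)
    setClass("Foo", representation(x = "numeric"))
    ".__C__Foo" %in% ls(all.names = TRUE)   # TRUE: the metaname of the class object
    class(get(".__C__Foo"))                 # "classRepresentation"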

+

The objects are S4 objects of class "classRepresentation" which +is defined in the methods package. +

+

Since these are just objects, they are subject to the normal scoping +rules and can be imported and exported from namespaces like other +objects. The directives importClassesFrom and +exportClasses are merely convenient ways to refer to class +objects without needing to know their internal ‘metaname’ (although +exportClasses does a little sanity checking via isClass). +

+
+ +
+


1.12.3 S4 methods

+ +

Details of methods are stored in S4 objects of class +"MethodsList". They have a non-syntactic name of the form +.__M__generic:package for all methods defined in the +current environment for the named generic derived from a specific +package (which might be .GlobalEnv). +

+

There is also environment .__T__generic:package which +has names the signatures of the methods defined, and values the +corresponding method functions. This is often referred to as a ‘methods +table’. +
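
A sketch (the generic and method are created in the global environment, so the package part of the metaname is .GlobalEnv; the object name follows the pattern just described):

    setGeneric("area", function(object) standardGeneric("area"))
    setMethod("area", "numeric", function(object) object^2)
    ls(get(".__T__area:.GlobalEnv"))   # "numeric": signatures for which methods are defined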

+

When a package without a namespace is attached these objects become +visible on the search path. library calls +methods:::cacheMetaData to update the internal tables. +

+

During an R session there is an environment associated with each non-primitive generic containing objects .AllMTable, .Generic, .Methods, .MTable, .SigArgs and .SigLength. .MTable and .AllMTable are merged methods tables containing all the methods defined directly and via inheritance respectively. .Methods is a merged methods list.

+

Exporting methods from a namespace is more complicated than exporting a +class. Note first that you do not export a method, but rather the +directive exportMethods will export all the methods defined in +the namespace for a specified generic: the code also adds to the list +of generics any that are exported directly. For generics which are +listed via exportMethods or exported themselves, the +corresponding "MethodsList" and environment are exported and so +will appear (as hidden objects) in the package environment. +

+

Methods for primitives which are internally S4 generic (see below) are +always exported, whether mentioned in the NAMESPACE file or not. +

+

Methods can be imported either via the directive +importMethodsFrom or via importing a namespace by import. +Also, if a generic is imported via importFrom, its methods are +also imported. In all cases the generic will be imported if it is in +the namespace, so importMethodsFrom is most appropriate for +methods defined on generics in other packages. Since methods for a +generic could be imported from several different packages, the methods +tables are merged. +

+

When a package with a namespace is attached +methods:::cacheMetaData is called to update the internal tables: +only the visible methods will be cached. +

+ +
+ +
+


1.12.4 Mechanics of S4 dispatch

+ +

This subsection does not discuss how S4 methods are chosen: see +https://developer.r-project.org/howMethodsWork.pdf. +

+

For all but primitive functions, setting a method on an existing function that is not itself S4 generic creates a new object in the current environment which is a call to standardGeneric with the old definition as the default method. Such S4 generics can also be created via a call to setGeneric and are standard closures in the R language, with environment the environment within which they are created. With the advent of namespaces this is somewhat problematic: if myfn was previously in a package with a namespace there will be two functions called myfn on the search path, and which will be called depends on which search path is in use. This is starkest for functions in the base namespace, where the original will be found ahead of the newly created function from any other package with a namespace.

+

Primitive functions are treated quite differently, for efficiency +reasons: this results in different semantics. setGeneric is +disallowed for primitive functions. The methods namespace +contains a list .BasicFunsList named by primitive functions: +the entries are either FALSE or a standard S4 generic showing +the effective definition. When setMethod (or +setReplaceMethod) is called, it either fails (if the list entry +is FALSE) or a method is set on the effective generic given in +the list. +

+

Actual dispatch of S4 methods for almost all primitives piggy-backs on +the S3 dispatch mechanism, so S4 methods can only be dispatched for +primitives which are internally S3 generic. When a primitive that is +internally S3 generic is called with a first argument which is an S4 +object and S4 dispatch is on (that is, the methods namespace is +loaded), DispatchOrEval calls R_possible_dispatch (defined +in file src/main/objects.c). (Members of the S3 group generics, +which includes all the generic operators, are treated slightly +differently: the first two arguments are checked and +DispatchGroup is called.) R_possible_dispatch first +checks an internal table to see if any S4 methods are set for that +generic (and S4 dispatch is currently enabled for that generic), and if +so proceeds to S4 dispatch using methods stored in another internal +table. All primitives are in the base namespace, and this mechanism +means that S4 methods can be set for (some) primitives and will always +be used, in contrast to setting methods on non-primitives. +

+

The exception is %*%, which is S4 generic but not S3 generic as +its C code contains a direct call to R_possible_dispatch. +

+

The primitive as.double is special, as as.numeric and +as.real are copies of it. The methods package code partly +refers to generics by name and partly by function, and maps +as.double and as.real to as.numeric (since that is +the name used by packages exporting methods for it). +

+

Some elements of the language are implemented as primitives, for example +}. This includes the subset and subassignment ‘functions’ and +they are S4 generic, again piggybacking on S3 dispatch. +

+

.BasicFunsList is generated when methods is installed, by +computing all primitives, initially disallowing methods on all and then +setting generics for members of .GenericArgsEnv, the S4 group +generics and a short exceptions list in file BasicFunsList.R: this +currently contains the subsetting and subassignment operators and an +override for c. +

+
+ + + +

1.13 Memory allocators

+ +

R’s memory allocation is almost all done via routines in file src/main/memory.c. It is important to keep track of where memory is allocated, as the Windows port (by default) makes use of a memory allocator that differs from malloc etc as provided by MinGW. Specifically, there are entry points Rm_malloc, Rm_free, Rm_calloc and Rm_realloc provided by file src/gnuwin32/malloc.c. This was done for two reasons. The primary motivation was performance: the allocator provided by MSVCRT via MinGW was far too slow at handling the many small allocations that the allocation system for SEXPRECs uses. As a side benefit, we can set a limit on the amount of allocated memory: this is useful because although Windows does provide virtual memory, swapping is far slower there than on many other R platforms, so limiting R’s use of it is highly advantageous. The high-performance allocator is only called from src/main/memory.c, src/main/regex.c, src/extra/pcre and src/extra/xdr: note that this means that it is not used in packages.

+

The rest of R should where possible make use of the allocators made +available by file src/main/memory.c, which are also the methods +recommended in +Memory allocation in Writing R Extensions + + + + +for use in R packages, namely the use of R_alloc, +Calloc, Realloc and Free. Memory allocated by +R_alloc is freed by the garbage collector once the ‘watermark’ +has been reset by calling + +vmaxset. This is done automatically by the wrapper code calling +primitives and .Internal functions (and also by the wrapper code +to .Call and .External), but + +vmaxget and vmaxset can be used to reset the watermark +from within internal code if the memory is only required for a short +time. +

+ +

All of the methods of memory allocation mentioned so far are relatively expensive. All R platforms support alloca, and in almost all cases this is managed by the compiler, allocates memory on the C stack and is very efficient.

+

There are two disadvantages in using alloca. First, it is +fragile and care is needed to avoid writing (or even reading) outside +the bounds of the allocation block returned. Second, it increases the +danger of overflowing the C stack. It is suggested that it is only +used for smallish allocations (up to tens of thousands of bytes), and +that +

+ +
+
    R_CheckStack();
+
+ +

is called immediately after the allocation (as R’s stack checking +mechanism will warn far enough from the stack limit to allow for modest +use of alloca). (do_makeunique in file src/main/unique.c +provides an example of both points.) +

+

There is an alternative check, + +

+
    R_CheckStack2(size_t extra);
+
+ +

to be called immediately before trying an allocation of +extra bytes. +

+

An alternative strategy has been used for various functions which +require intermediate blocks of storage of varying but usually small +size, and this has been consolidated into the routines in the header +file src/main/RBufferUtils.h. This uses a structure which +contains a buffer, the current size and the default size. A call to + +

+
    R_AllocStringBuffer(size_t blen, R_StringBuffer *buf);
+
+ +

sets buf->data to a memory area of at least blen+1 bytes. +At least the default size is used, which means that for small +allocations the same buffer can be reused. A call to + + +R_FreeStringBufferL releases memory if more than the default has +been allocated whereas a call to R_FreeStringBuffer frees any +memory allocated. +

+

The R_StringBuffer structure needs to be initialized, for example by +

+
+
static R_StringBuffer ex_buff = {NULL, 0, MAXELTSIZE};
+
+ +

which uses a default size of MAXELTSIZE = 8192 bytes. Most +current uses have a static R_StringBuffer structure, which +allows the (default-sized) buffer to be shared between calls to e.g. +grep and even between functions: this will need to be changed if +R ever allows concurrent evaluation threads. So the idiom is +

+
+
static R_StringBuffer ex_buff = {NULL, 0, MAXELTSIZE};
+...
+    char *buf;
+    for(i = 0; i < n; i++) {
+        /* compute len, the number of bytes needed for element i */
+        buf = R_AllocStringBuffer(len, &ex_buff);
+        /* use buf */
+    }
+    /*  free allocation if larger than the default, but leave
+        default allocated for future use */
+   R_FreeStringBufferL(&ex_buff);
+
+ + + + + + +
+ + + +

1.13.1 Internals of R_alloc

+ +

The memory used by R_alloc is allocated as R vectors, of type +RAWSXP. Thus the allocation is in units of 8 bytes, and is +rounded up. A request for zero bytes currently returns NULL (but +this should not be relied on). For historical reasons, in all other +cases 1 byte is added before rounding up so the allocation is always +1–8 bytes more than was asked for: again this should not be relied on. +

+

The vectors allocated are protected via the setting of R_VStack, +as the garbage collector marks everything that can be reached from that +location. When a vector is R_allocated, its ATTRIB +pointer is set to the current R_VStack, and R_VStack is +set to the latest allocation. Thus R_VStack is a single-linked +chain of the vectors currently allocated via R_alloc. Function +vmaxset resets the location R_VStack, and should be to a +value that has previously be obtained via vmaxget: +allocations after the value was obtained will no longer be protected and +hence available for garbage collection. +

+
+ + + +

1.14 Internal use of global and base environments

+ +

This section notes known use by the system of these environments: the +intention is to minimize or eliminate such uses. +

+ + + + + +
+ + + +

1.14.1 Base environment

+ + + + + +

The graphics devices system maintains two variables .Device and +.Devices in the base environment: both are always set. The +variable .Devices gives a list of character vectors of the names +of open devices, and .Device is the element corresponding to the +currently active device. The null device will always be open. +

+ +

There appears to be a variable .Options, a pairlist giving the +current options settings. But in fact this is just a symbol with a +value assigned, and so shows up as a base variable. +

+ +

Similarly, the evaluator creates a symbol .Last.value which +appears as a variable in the base environment. +
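
These can be seen from the prompt (a sketch; the exact values depend on the session):

    .Device                 # e.g. "null device"
    .Devices                # names of the open devices
    find(".Options")        # "package:base"
    find(".Last.value")     # "package:base" (after at least one top-level evaluation)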

+ + +

Errors can give rise to objects .Traceback and +last.warning in the base environment. +

+
+ + + +

1.14.2 Global environment

+ + + + +

The seed for the random number generator is stored in object +.Random.seed in the global environment. +
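
For example (a sketch):

    set.seed(42)
    exists(".Random.seed", envir = globalenv())   # TRUE
    typeof(.Random.seed)                          # "integer"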

+ +

Some error handlers may give rise to objects in the global environment: +for example dump.frames by default produces last.dump. +

+ +

The windows() device makes use of a variable .SavedPlots +to store display lists of saved plots for later display. This is +regarded as a variable created by the user. +

+ +
+ + + +

1.15 Modules

+ + +

R makes use of a number of shared objects/DLLs stored in the +modules directory. These are parts of the code which have been +chosen to be loaded ‘on demand’ rather than linked as dynamic libraries +or incorporated into the main executable/dynamic library. +

+

For the remaining modules the motivation has been the amount of (often +optional) code they will bring in via libraries to which they are +linked. +

+
+
internet
+

The internal HTTP and FTP clients and socket support, which link to +system-specific support libraries. This may load libcurl and on +Windows will load wininet.dll and ws2_32.dll. +

+
+
lapack
+

The code which makes use of the LAPACK library, and is linked to +libRlapack or an external LAPACK library. +

+
+
X11
+

(Unix-alikes only.) The X11(), jpeg(), png() and +tiff() devices. These are optional, and links to some or all of +the X11, pango, cairo, jpeg, libpng +and libtiff libraries. +

+
+ +
+ +
+


1.16 Visibility

+ + + + + + + +
+ + + +

1.16.1 Hiding C entry points

+ +

We make use of the visibility mechanisms discussed in Controlling visibility in Writing R Extensions: C entry points not needed outside the main R executable/dynamic library (and in particular not in any package or module) should be prefixed by attribute_hidden. Minimizing the visibility of symbols in the R dynamic library will speed up linking to it (which packages will do) and reduce the possibility of linking to the wrong entry points of the same name. In addition, on some platforms reducing the number of entry points allows more efficient versions of PIC to be used: somewhat over half the entry points are hidden. A convenient way to hide variables (as distinct from functions) is to declare them extern0 in header file Defn.h.

+

The visibility mechanism used is only available with some compilers and platforms, and in particular not on Windows, where an alternative mechanism is used. Entry points will not be made available in R.dll if they are listed in the file src/gnuwin32/Rdll.hide. Entries in that file start with a space and must be strictly in alphabetic order in the C locale (use sort on the file to ensure this if you change it). It is possible to hide Fortran as well as C entry points via this file: the former are lower-cased and have an underline as suffix, and the suffixed name should be included in the file. Some entry points exist only on Windows or need to be visible only on Windows, and some notes on these are provided in file src/gnuwin32/Maintainers.notes.

+

Because of the advantages of reducing the number of visible entry +points, they should be declared attribute_hidden where possible. +Note that this only has an effect on a shared-R-library build, and so +care is needed not to hide entry points that are legitimately used by +packages. So it is best if the decision on visibility is made when a +new entry point is created, including the decision if it should be +included in header file Rinternals.h. A list of the visible +entry points on shared-R-library build on a reasonably standard +Unix-alike can be made by something like +

+
+
nm -g libR.so | grep ' [BCDT] ' | cut -b20-
+
+ +
+ +
+


1.16.2 Variables in Windows DLLs

+ +

Windows is unique in that it conventionally treats importing variables +differently from functions: variables that are imported from a DLL need +to be specified by a prefix (often ‘_imp_’) when being linked to +(‘imported’) but not when being linked from (‘exported’). The details +depend on the compiler system, and have changed for MinGW during the +lifetime of that port. They are in the main hidden behind some macros +defined in header file R_ext/libextern.h. +

+

A (non-function) variable in the main R sources that needs to be +referred to outside R.dll (in a package, module or another DLL +such as Rgraphapp.dll) should be declared with prefix +LibExtern. The main use is in Rinternals.h, but it needs +to be considered for any public header and also Defn.h. +

+

It would nowadays be possible to make use of the ‘auto-import’ feature of the MinGW port of ld to fix up imports from DLLs (and if R is built for the Cygwin platform this is what happens). However, this was not possible when the MinGW build of R was first constructed in ca 1998, it allows less control of visibility, and it would not work for other Windows compiler suites.

+

It is only possible to check if this has been handled correctly by +compiling the R sources on Windows. +

+
+ +
+


1.17 Lazy loading

+ +

Lazy loading is always used for code in packages but is optional +(selected by the package maintainer) for datasets in packages. When a +package/namespace which uses it is loaded, the package/namespace +environment is populated with promises for all the named objects: when +these promises are evaluated they load the actual code from a database. +

+

There are separate databases for code and data, stored in the R +and data subdirectories. The database consists of two files, +name.rdb and name.rdx. The .rdb file +is a concatenation of serialized objects, and the .rdx file +contains an index. The objects are stored in (usually) a +gzip-compressed format with a 4-byte header giving the +uncompressed serialized length (in XDR, that is big-endian, byte order) +and read by a call to the primitive lazyLoadDBfetch. (Note that +this makes lazy-loading unsuitable for really large objects: the +unserialized length of an R object can exceed 4GB.) +

+

The index or ‘map’ file name.rdx is a compressed serialized +R object to be read by readRDS. It is a list with three +elements variables, references and compressed. The +first two are named lists of integer vectors of length 2 giving the +offset and length of the serialized object in the name.rdb +file. Element variables has an entry for each named object: +references serializes a temporary environment used when named +environments are added to the database. compressed is a logical +indicating if the serialized objects were compressed: compression is +always used nowadays. We later added the values compressed = 2 +and 3 for bzip2 and xz compression (with the +possibility of future expansion to other methods): these formats add a +fifth byte to the header for the type of compression, and store +serialized objects uncompressed if compression expands them. +
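
The index of an installed package can be examined directly; a sketch (paths and the exact set of names depend on the installation):

    rdx <- readRDS(file.path(find.package("stats"), "R", "stats.rdx"))
    names(rdx)               # "variables" "references" "compressed"
    rdx$variables[["var"]]   # offset and length of var() within stats.rdb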

+

The loader for a lazy-load database of code or data is function +lazyLoad in the base package, but note that there is a +separate copy to load base itself in file +R_HOME/base/R/base. +

+

Lazy-load databases are created by the code in +src/library/tools/R/makeLazyLoad.R: the main tool is the +unexported function makeLazyLoadDB and the insertion of database +entries is done by calls to .Call("R_lazyLoadDBinsertValue", +...). +

+

Lazy-load databases of less than 10MB are cached in memory at first use: +this was found necessary when using file systems with high latency +(removable devices and network-mounted file systems on Windows). +

+

Lazy-load databases are loaded into the exports for a package, but not +into the namespace environment itself. Thus they are visible when the +package is attached, and also via the :: operator. +This was a deliberate design decision, as packages mostly make datasets +available for use by the end user (or other packages), and they should +not be found preferentially from functions in the package, surprising +users who expected the normal search path to be used. (There is an +alternative mechanism, sysdata.rda, for ‘system datasets’ that +are intended primarily to be used within the package.) +

+

The same database mechanism is used to store parsed Rd files. +One or all of the parsed objects is fetched by a call to +tools:::fetchRdDB. +

+
+ + + +

2 .Internal vs .Primitive

+ + + +

C code compiled into R at build time can be called directly in what +are termed primitives or via the .Internal interface, +which is very similar to the .External interface except in +syntax. More precisely, R maintains a table of R function names and +corresponding C functions to call, which by convention all start with +‘do_’ and return a SEXP. This table (R_FunTab in +file src/main/names.c) also specifies how many arguments to a +function are required or allowed, whether or not the arguments are to be +evaluated before calling, and whether the function is ‘internal’ in +the sense that it must be accessed via the .Internal interface, +or directly accessible in which case it is printed in R as +.Primitive. +

+

Functions using .Internal() wrapped in a closure are in general +preferred as this ensures standard handling of named and default +arguments. For example, grep is defined as +

+
+
grep <-
+function (pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE,
+         fixed = FALSE, useBytes = FALSE, invert = FALSE)
+{
+    if (!is.character(x)) x <- structure(as.character(x), names = names(x))
+    .Internal(grep(as.character(pattern), x, ignore.case, value,
+                   perl, fixed, useBytes, invert))
+}
+
+
+

and the use of as.character allows methods to be dispatched (for +example, for factors). +

+

However, for reasons of convenience and also efficiency (as there is +some overhead in using the .Internal interface wrapped in a +function closure), the primitive functions are exceptions that can be +accessed directly. And of course, primitive functions are needed for +basic operations—for example .Internal is itself a primitive. +Note that primitive functions make no use of R code, and hence are +very different from the usual interpreted functions. In particular, +formals and body return NULL for such objects, and +argument matching can be handled differently. For some primitives +(including call, switch, .C and .subset) +positional matching is important to avoid partial matching of the first +argument. +
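
The difference is easy to see from R (a sketch):

    is.primitive(sum)    # TRUE: printed as .Primitive("sum")
    formals(sum)         # NULL
    body(sum)            # NULL
    is.primitive(grep)   # FALSE: a closure whose body ends in a .Internal() call
    formals(grep)$fixed  # FALSE: ordinary formals with defaults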

+

The list of primitive functions is subject to change; currently, it +includes the following. +

+
    +
  1. “Special functions” which really are language elements, but +implemented as primitive functions: + +
    +
    {       (         if     for      while  repeat  break  next
    +return  function  quote  switch
    +
    + +
  2. Language elements and basic operators (i.e., functions usually +not called as foo(a, b, ...)) for subsetting, assignment, +arithmetic, comparison and logic: + +
    +
                   [    [[    $    @
    +<-   <<-  =    [<-  [[<-  $<-  @<-
    +
    ++    -    *    /     ^    %%   %*%  %/%
    +<    <=   ==   !=    >=   >
    +|    ||   &    &&    !
    +
    + +

    When the arithmetic, comparison and logical operators are called as +functions, any argument names are discarded so positional matching is used. +

    +
  3. “Low level” 0– and 1–argument functions which belong to one of the +following groups of functions: + +
      +
    1. Basic mathematical functions with a single argument, i.e., + +
      +
      abs     sign    sqrt
      +floor   ceiling
      +
      +
      exp     expm1
      +log2    log10   log1p
      +cos     sin     tan
      +acos    asin    atan
      +cosh    sinh    tanh
      +acosh   asinh   atanh
      +cospi   sinpi   tanpi
      +
      +
      gamma   lgamma  digamma trigamma
      +
      +
      cumsum  cumprod cummax  cummin
      +
      +
      Im  Re  Arg  Conj  Mod
      +
      + +

      log is a primitive function of one or two arguments with named +argument matching. +

      +

      trunc is a difficult case: it is a primitive that can have one +or more arguments: the default method handled in the primitive has +only one. +

      +
    2. Functions rarely used outside of “programming” (i.e., mostly used +inside other functions), such as + +
      +
      nargs          missing        on.exit        interactive
      +as.call        as.character   as.complex     as.double
      +as.environment as.integer     as.logical     as.raw
      +is.array       is.atomic      is.call        is.character
      +is.complex     is.double      is.environment is.expression
      +is.finite      is.function    is.infinite    is.integer
      +is.language    is.list        is.logical     is.matrix
      +is.na          is.name        is.nan         is.null
      +is.numeric     is.object      is.pairlist    is.raw
      +is.real        is.recursive   is.single      is.symbol
      +baseenv        emptyenv       globalenv      pos.to.env
      +unclass        invisible      seq_along      seq_len
      +
      + +
    3. The programming and session management utilities + +
      +
      browser  proc.time  gc.time tracemem retracemem untracemem
      +
      + +
    + +
  4. The following basic replacement and extractor functions + +
    +
    length      length<-
    +class       class<-
+oldClass    oldClass<-
    +attr        attr<-
    +attributes  attributes<-
    +names       names<-
    +dim         dim<-
    +dimnames    dimnames<-
    +            environment<-
    +            levels<-
    +            storage.mode<-
    +
    + + +

    Note that optimizing NAMED = 1 is only effective within a +primitive (as the closure wrapper of a .Internal will set +NAMED = 2 when the promise to the argument is evaluated) and +hence replacement functions should where possible be primitive to avoid +copying (at least in their default methods). +

    +
  5. The following functions are primitive for efficiency reasons: + +
    +
    :          ~          c           list
    +call       expression substitute
    +UseMethod  standardGeneric
    +.C         .Fortran   .Call       .External
    +round      signif      rep        seq.int
    +
    + +

    as well as the following internal-use-only functions +

    +
    +
    .Primitive     .Internal
    +.Call.graphics .External.graphics
    +.subset        .subset2
    +.primTrace     .primUntrace
    +lazyLoadDBfetch
    +
    + +
+ + +

The multi-argument primitives +

+
call       switch
+.C         .Fortran   .Call       .External
+
+ +

intentionally use positional matching, and need to do so to avoid +partial matching to their first argument. They do check that the first +argument is unnamed or for the first two, partially matches the formal +argument name. On the other hand, +

+
+
attr       attr<-     browser     retracemem substitute  UseMethod
+log        round      signif      rep        seq.int
+
+ +

manage their own argument matching and do work in the standard way. +

+

All the one-argument primitives check that if they are called with a +named argument that this (partially) matches the name given in the +documentation: this is also done for replacement functions with one +argument plus value. +

+

The net effect is that argument matching for primitives intended for +end-user use as functions is done in the same way as for +interpreted functions except for the six exceptions where positional +matching is required. +

+ + + + + + + +
+ + + +

2.1 Special primitives

+ +

A small number of primitives are specials rather than +builtins, that is they are entered with unevaluated arguments. +This is clearly necessary for the language constructs and the assignment +operators, as well as for && and || which conditionally +evaluate their second argument, and ~, .Internal, +call, expression, missing, on.exit, +quote and substitute which do not evaluate some of their +arguments. +

+

rep and seq.int are special as they evaluate some of their +arguments conditional on which are non-missing. +

+

log, round and signif are special to allow default +values to be given to missing arguments. +

+

The subsetting, subassignment and @ operators are all special. +(For both extraction and replacement forms, $ and @ +take a symbol argument, and [ and [[ allow missing +arguments.) +

+

UseMethod is special to avoid the additional contexts added to +calls to builtins. +

+
+ + + +

2.2 Special internals

+ +

There are also special .Internal functions: NextMethod, +Recall, withVisible, cbind, rbind (to allow +for the deparse.level argument), eapply, lapply and +vapply. +

+
+ + + +

2.3 Prototypes for primitives

+ +

Prototypes are available for the primitive functions and operators, and +these are used for printing, args and package checking (e.g. by +tools::checkS3methods and by package codetools). There are +two environments in the base package (and namespace), +‘.GenericArgsEnv’ for those primitives which are internal S3 +generics, and ‘.ArgsEnv’ for the rest. Those environments contain +closures with the same names as the primitives, formal arguments derived +(manually) from the help pages, a body which is a suitable call to +UseMethod or NULL and environment the base namespace. +
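
Both environments can be inspected from R (a sketch; the exact contents vary by version):

    head(ls(.GenericArgsEnv))   # prototypes for the internally generic primitives
    .ArgsEnv$interactive        # function () NULL: the prototype consulted by args()
    args(interactive)           # printed using that prototype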

+

The C code for print.default and args uses the closures in +these environments in preference to the definitions in base (as +primitives). +

+

The QC function undoc checks that all the functions prototyped in +these environments are currently primitive, and that the primitives not +included are better thought of as language elements (at the time of +writing +

+
+
$  $<-  &&  (  :  @  @<-  [  [[  [[<-  [<-  {  ||  ~  <-  <<-  =
+break  for function  if  next  repeat  return  while
+
+ +

). One could argue about ~, but it is known to the parser and has semantics quite unlike a normal function. And : is documented with different argument names in its two meanings.

+

The QC functions codoc and checkS3methods also make use of +these environments (effectively placing them in front of base in the +search path), and hence the formals of the functions they contain are +checked against the help pages by codoc. However, there are two +problems with the generic primitives. The first is that many of the +operators are part of the S3 group generic Ops and that defines +their arguments to be e1 and e2: although it would be very +unusual, an operator could be called as e.g. "+"(e1=a, e2=b) +and if method dispatch occurred to a closure, there would be an argument +name mismatch. So the definitions in environment .GenericArgsEnv +have to use argument names e1 and e2 even though the +traditional documentation is in terms of x and y: +codoc makes the appropriate adjustment via +tools:::.make_S3_primitive_generic_env. The second discrepancy +is with the Math group generics, where the group generic is +defined with argument list (x, ...), but most of the members only +allow one argument when used as the default method (and round and +signif allow two as default methods): again fix-ups are used. +

+

Those primitives which are in .GenericArgsEnv are checked (via +tests/primitives.R) to be generic via defining methods for +them, and a check is made that the remaining primitives are probably not +generic, by setting a method and checking it is not dispatched to (but +this can fail for other reasons). However, there is no certain way to +know that if other .Internal or primitive functions are not +internally generic except by reading the source code. +

+
+ + + +

2.4 Adding a primitive

+ +

[For R-core use: reverse this procedure to remove a primitive. Most +commonly this is done by changing a .Internal to a primitive or +vice versa.] +

+

Primitives are listed in the table R_FunTab in +src/main/names.c: primitives have ‘Y = 0’ in the ‘eval’ +field. +

+

There needs to be an ‘\alias’ entry in a help file in the base +package, and the primitive needs to be added to one of the lists at the +start of this section. +

+

Some primitives are regarded as language elements (the current ones are +listed above). These need to be added to two lists of exceptions, +langElts in undoc() (in file +src/library/tools/R/QC.R) and lang_elements in +tests/primitives.R. +

+

All other primitives are regarded as functions and should be listed in +one of the environments defined in src/library/base/R/zzz.R, +either .ArgsEnv or .GenericArgsEnv: internal generics also +need to be listed in the character vector .S3PrimitiveGenerics. +Note too the discussion about argument matching above: if you add a +primitive function with more than one argument by converting a +.Internal you need to add argument matching to the C code, and +for those with a single argument, add argument-name checking. +

+

Do ensure that make check-devel has been run: that tests most +of these requirements. +

+
+ +
+


3 Internationalization in the R sources

+ +

The process of marking messages (errors, warnings etc) for translation +in an R package is described in +Internationalization in Writing R Extensions, +and the standard packages included with R have (with an exception in +grDevices for the menus of the windows() device) been +internationalized in the same way as other packages. +

+ + + + + + + + +
+ + + +

3.1 R code

+ +

Internationalization for R code is done in exactly the same way as +for extension packages. As all standard packages which have R code +also have a namespace, it is never necessary to specify domain, +but for efficiency calls to message, warning and +stop should include domain = NA when the message is +constructed via gettextf, gettext or +ngettext. +
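
A typical call inside a package function therefore looks like the following sketch (the function, message text and argument name are invented for illustration):

    msg_check <- function(x) {
        if (!is.character(x))
            stop(gettextf("'%s' must be a character string", "x"), domain = NA)
        invisible(x)
    }
    msg_check("ok")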

+

For each package, the extracted messages and translation sources are +stored under package directory po in the source package, and +compiled translations under inst/po for installation to package +directory po in the installed package. This also applies to C +code in packages. +

+
+ + + +

3.2 Main C code

+ +

The main C code (e.g. that in files src/*/*.c and in +the modules) is where R is closest to the sort of application for +which ‘gettext’ was written. Messages in the main C code are in +domain R and stored in the top-level directory po with +compiled translations under share/locale. +

+

The list of files covered by the R domain is specified in file +po/POTFILES.in. +

+

The normal way to mark messages for translation is via _("msg") +just as for packages. However, sometimes one needs to mark passages for +translation without wanting them translated at the time, for example +when declaring string constants. This is the purpose of the N_ +macro, for example +

+
+
{ ERROR_ARGTYPE,           N_("invalid argument type")},
+
+ +

from file src/main/errors.c. +

+

The P_ macro +

+
+
#ifdef ENABLE_NLS
+#define P_(StringS, StringP, N) ngettext (StringS, StringP, N)
+#else
+#define P_(StringS, StringP, N) (N > 1 ? StringP: StringS)
+#endif
+
+ +

may be used +as a wrapper for ngettext: however in some cases the preferred +approach has been to conditionalize (on ENABLE_NLS) code using +ngettext. +

+

The macro _("msg") can safely be used in directory +src/appl; the header for standalone ‘nmath’ skips possible +translation. (This does not apply to N_ or P_). +

+ +
+ + + +

3.3 Windows-GUI-specific code

+ +

Messages for the Windows GUI are in a separate domain ‘RGui’. This +was done for two reasons: +

+
    +
  • The translators for the Windows version of R might be separate from +those for the rest of R (familiarity with the GUI helps), and + +
  • Messages for Windows are most naturally handled in the native charset +for the language, and in the case of CJK languages the charset is +Windows-specific. (It transpires that as the iconv we ported +works well under Windows, this is less important than anticipated.) +
+ +

Messages for the ‘RGui’ domain are marked by G_("msg"), a +macro that is defined in header file src/gnuwin32/win-nls.h. The +list of files that are considered is hardcoded in the +RGui.pot-update target of file po/Makefile.in.in: note +that this includes devWindows.c as the menus on the +windows device are considered to be part of the GUI. (There is +also GN_("msg"), the analogue of N_("msg").) +

+

The template and message catalogs for the ‘RGui’ domain are in the +top-level po directory. +

+ +
+ + + +

3.4 OS X GUI

+ +

This is handled separately: see +https://developer.r-project.org/Translations30.html. +

+ +
+ + + +

3.5 Updating

+ +

See file po/README for how to update the message templates and catalogs. +

+
+ + + +

4 Structure of an Installed Package

+ + + + + + +

The structure of a source package is described in Creating R packages in Writing R Extensions: this chapter is concerned with the structure of installed packages.

+

An installed package has a top-level file DESCRIPTION, a copy of +the file of that name in the package sources with a ‘Built’ field +appended, and file INDEX, usually describing the objects on which +help is available, a file NAMESPACE if the package has a name +space, optional files such as CITATION, LICENCE and +NEWS, and any other files copied in from inst. It will +have directories Meta, help and html (even if the +package has no help pages), almost always has a directory R and +often has a directory libs to contain compiled code. Other +directories with known meaning to R are data, demo, +doc and po. +

+

Function library looks for a namespace and if one is found passes control to loadNamespace. Then library or loadNamespace looks for file R/pkgname, warns if it is not found and otherwise sources the code (using sys.source) into the package’s environment, then lazy-loads a database R/sysdata if present. So how R code gets loaded depends on the contents of R/pkgname: a standard template to load lazy-load databases is provided in share/R/nspackloader.R.

+

Compiled code is usually loaded when the package’s namespace is loaded +by a useDynlib directive in a NAMESPACE file or by the +package’s .onLoad function. Conventionally compiled code is +loaded by a call to library.dynam and this looks in directory +libs (and in an appropriate sub-directory if sub-architectures +are in use) for a shared object (Unix-alike) or DLL (Windows). +

+

Subdirectory data serves two purposes. In a package using lazy-loading of data, it contains a lazy-load database Rdata, plus a file Rdata.rds which contains a named character vector used by data() in the (unusual) event that it is used for such a package. Otherwise it is a copy of the data directory in the sources, with saved images re-compressed if R CMD INSTALL --resave-data was used.

+

Subdirectory demo supports the demo function, and is +copied from the sources. +

+

Subdirectory po contains (in subdirectories) compiled message +catalogs. +

+
+ +
+

+Next: , Previous: , Up: Package Structure   [Contents][Index]

+
+ +

4.1 Metadata

+ +

Directory Meta contains several files in .rds format, that +is serialized R objects written by saveRDS. All packages +have files Rd.rds, hsearch.rds, links.rds and +package.rds. Packages with namespaces have a file +nsInfo.rds, and those with data, demos or vignettes have +data.rds, demo.rds or vignette.rds files. +

+

The structure of these files (and their existence and names) is private +to R, so the description here is for those trying to follow the R +sources: there should be no reference to these files in non-base +packages. +

+

File package.rds is a dump of information extracted from the DESCRIPTION file. It is a list of several components. The first, ‘DESCRIPTION’, is a character vector, the DESCRIPTION file as read by read.dcf. Further elements ‘Depends’, ‘Suggests’, ‘Imports’, ‘Rdepends’ and ‘Rdepends2’ record the ‘Depends’, ‘Suggests’ and ‘Imports’ fields. These are all lists, and can be empty. The first three have an entry for each package named, each entry being a list of length 1 or 3, with element ‘name’ (the package name) and optional elements ‘op’ (a character string) and ‘version’ (an object of class ‘"package_version"’). Element ‘Rdepends’ is used for the first version dependency on R, and ‘Rdepends2’ is a list of zero or more R version dependencies—each is a three-element list of the form described for packages. Element ‘Rdepends’ is no longer used, but it is still potentially needed so R < 2.7.0 can detect that the package was not installed for it.

+

File nsInfo.rds records a list, a parsed version of the +NAMESPACE file. +

+

File Rd.rds records a data frame with one row for each help file. +The columns are ‘File’ (the file name with extension), ‘Name’ +(the ‘\name’ section), ‘Type’ (from the optional +‘\docType’ section), ‘Title’, ‘Encoding’, ‘Aliases’, +‘Concepts’ and ‘Keywords’. All columns are character vectors +apart from ‘Aliases’, which is a list of character vectors. +

+

File hsearch.rds records the information to be used by ‘help.search’. This is a list of four unnamed elements which are character matrices for help files, aliases, keywords and concepts. All the matrices have columns ‘ID’ and ‘Package’ which are used to tie the aliases, keywords and concepts (the remaining column of the last three elements) to a particular help file. The first element has further columns ‘LibPath’ (stored as "" and filled in when the file is loaded), ‘name’, ‘title’, ‘topic’ (the first alias, used when presenting the results as ‘pkgname::topic’) and ‘Encoding’.

+

File links.rds records a named character vector, the names being +aliases and the values character strings of the form +

+
"../../pkgname/html/filename.html"
+
+ +

File data.rds records a two-column character matrix with columns +of dataset names and titles from the corresponding help file. File +demo.rds has the same structure for package demos. +

+

File vignette.rds records a data frame with one row for each +‘vignette’ (.[RS]nw file in inst/doc) and with columns +‘File’ (the full file path in the sources), ‘Title’, +‘PDF’ (the pathless file name of the installed PDF version, if +present), ‘Depends’, ‘Keywords’ and ‘R’ (the pathless +file name of the installed R code, if present). +

+ +
+ +
+

+Previous: , Up: Package Structure   [Contents][Index]

+
+ +

4.2 Help

+ +

All installed packages, whether they had any .Rd files or not, +have help and html directories. The latter normally only +contains the single file 00Index.html, the package index which +has hyperlinks to the help topics (if any). +

+

Directory help contains files AnIndex, paths.rds +and pkgname.rd[bx]. The latter two files are a lazy-load +database of parsed .Rd files, accessed by +tools:::fetchRdDB. File paths.rds is a saved character +vector of the original path names of the .Rd files, used when +updating the database. +

+

File AnIndex is a two-column tab-delimited file: the first column +contains the aliases defined in the help files and the second the +basename (without the .Rd or .rd extension) of the file +containing that alias. It is read by utils:::index.search to +search for files matching a topic (alias), and read by scan in +utils:::matchAvailableTopics, part of the completion system. +

+

File aliases.rds contains the same information as AnIndex, as a named character vector (the names are the topics and the values the file basenames), for faster access.

+
+ +
+

+Next: , Previous: , Up: Top   [Contents][Index]

+
+ +

5 Files

+ +

R provides many functions to work with files and directories: many of +these have been added relatively recently to facilitate scripting in +R and in particular the replacement of Perl scripts by R scripts +in the management of R itself. +

+

These functions are implemented by standard C/POSIX library calls, +except on Windows. That means that filenames must be encoded in the +current locale as the OS provides no other means to access the file +system: increasingly filenames are stored in UTF-8 and the OS will +translate filenames to UTF-8 in other locales. So using a UTF-8 locale +gives transparent access to the whole file system. +

+

Windows is another story. There the internal view of filenames is in +UTF-16LE (so-called ‘Unicode’), and standard C library calls can only +access files whose names can be expressed in the current codepage. To +circumvent that restriction, there is a parallel set of Windows-specific +calls which take wide-character arguments for filepaths. Much of the +file-handling in R has been moved over to using these functions, so +filenames can be manipulated in R as UTF-8 encoded character strings, +converted to wide characters (which on Windows are UTF-16LE) and passed +to the OS. The utilities RC_fopen and filenameToWchar +help this process. Currently file.copy to a directory, +list.files, list.dirs and path.expand work only +with filepaths encoded in the current codepage. +

+

All these functions do tilde expansion, in the same way as +path.expand, with the deliberate exception of Sys.glob. +

+

File names may be case sensitive or not: the latter is the norm on +Windows and OS X, the former on other Unix-alikes. Note that this +is a property of both the OS and the file system: it is often possible +to map names to upper or lower case when mounting the file system. This +can affect the matching of patterns in list.files and +Sys.glob. +

+

File names commonly contain spaces on Windows and OS X but not elsewhere. As file names are handled as character strings by R, spaces are not usually a concern unless file names are passed to other processes, e.g. by a system call.

+

Windows has another couple of peculiarities. Whereas a POSIX file system has a single root directory (and other physical file systems are mounted onto logical directories under that root), Windows has separate roots for each physical or logical file system (‘volume’), organized under drives (with file paths starting D: for an ASCII letter, case-insensitively) and network shares (with paths like \\netname\topdir\myfiles\a file). There is a current drive, and path names without a drive part are relative to the current drive. Further, each drive has a current directory, and relative paths are relative to that current directory, on a particular drive if one is specified. So D:dir\file and D: are valid path specifications (the last being the current directory on drive D:).

+ + +
+ +
+

+Next: , Previous: , Up: Top   [Contents][Index]

+
+ +

6 Graphics

+ +

R’s graphics internals were re-designed to enable multiple graphics systems to be installed on top of the graphics ‘engine’ – currently there are two such systems, one supporting ‘base’ graphics (based on that in S and whose R code15 is in package graphics) and one implemented in package grid.

+

Some notes on the historical changes can be found at +https://www.stat.auckland.ac.nz/~paul/R/basegraph.html and +https://www.stat.auckland.ac.nz/~paul/R/graphicsChanges.html. +

+

At the lowest level is a graphics device, which manages a plotting +surface (a screen window or a representation to be written to a file). +This implements a set of graphics primitives, to ‘draw’ +

+
    +
  • a circle, optionally filled +
  • a rectangle, optionally filled +
  • a line +
  • a set of connected lines +
  • a polygon, optionally filled +
  • a path, optionally filled using a winding rule
  • text +
  • a raster image (optional) +
  • and to set a clipping rectangle +
+ +

as well as requests for information such as +

+
    +
  • the width of a string if plotted +
  • the metrics (width, ascent, descent) of a single character +
  • the current size of the plotting surface +
+ +

and requests/opportunities to take action such as +

+
    +
  • start a new ‘page’, possibly after responding to a request to ask +the user for confirmation. +
  • return the position of the device pointer (if any). +
  • when a device becomes the current device or stops being the current device (this is usually used to change the window title on a screen device).
  • when drawing starts or finishes (e.g. used to flush graphics to +the screen when drawing stops). +
  • wait for an event, for example a mouse click or keypress. +
  • an ‘onexit’ action, to clean up if plotting is interrupted (by an +error or by the user). +
  • capture the current contents of the device as a raster image. +
  • close the device. +
+ +

The device also sets a number of variables, mainly Boolean flags +indicating its capabilities. Devices work entirely in ‘device units’ +which are up to its developer: they can be in pixels, big points (1/72 +inch), twips, …, and can differ16 in the +‘x’ and ‘y’ directions. +

+

The next layer up is the graphics ‘engine’ that is the main interface to +the device (although the graphics subsystems do talk directly to +devices). This is responsible for clipping lines, rectangles and +polygons, converting the pch values 0...26 to sets of +lines/circles, centring (and otherwise adjusting) text, rendering +mathematical expressions (‘plotmath’) and mapping colour descriptions +such as names to the internal representation. +

+ +

Another function of the engine is to manage display lists and snapshots. +Some but not all instances of graphics devices maintain display lists, a +‘list’ of operations that have been performed on the device to produce +the current plot (since the device was opened or the plot was last +cleared, e.g. by plot.new). Screen devices generally maintain +a display list to handle repaint and resize events whereas file-based +formats do not—display lists are also used to implement +dev.copy() and friends. The display list is a pairlist of +.Internal (base graphics) or .Call.graphics (grid +graphics) calls, which means that the C code implementing a graphics +operation will be re-called when the display list is replayed: apart +from the part which records the operation if successful. +

+

Snapshots of the current graphics state are taken by +GEcreateSnapshot and replayed later in the session by +GEplaySnapshot. These are used by recordPlot(), +replayPlot() and the GUI menus of the windows() device. +The ‘state’ includes the display list. +

+ +

The top layer comprises the graphics subsystems. Although there is +provision for 24 subsystems since about 2001, currently still only two +exist, ‘base’ and +‘grid’. The base subsystem is registered with the engine when R is +initialized, and unregistered (via KillAllDevices) when an R +session is shut down. The grid subsystem is registered in its +.onLoad function and unregistered in the .onUnload +function. The graphics subsystem may also have ‘state’ information +saved in a snapshot (currently base does and grid does not). +

+

Package grDevices was originally created to contain the basic +graphics devices (although X11 is in a separate load-on-demand +module because of the volume of external libraries it brings in). Since +then it has been used for other functionality that was thought desirable +for use with grid, and hence has been transferred from package +graphics to grDevices. This is principally concerned with +the handling of colours and recording and replaying plots. +

+ + + + + + + +
+ +
+

+Next: , Previous: , Up: Graphics Devices   [Contents][Index]

+
+ +

6.1 Graphics Devices

+ +

R ships with several graphics devices, and there is support for +third-party packages to provide additional devices—several packages +now do. This section describes the device internals from the viewpoint +of a would-be writer of a graphics device. +

+ + + + + + + + + + +
+ + + +

6.1.1 Device structures

+ +

There are two types used internally which are pointers to structures +related to graphics devices. +

+

The DevDesc type is a structure defined in the header file +R_ext/GraphicsDevice.h (which is included by +R_ext/GraphicsEngine.h). This describes the physical +characteristics of a device, the capabilities of the device driver and +contains a set of callback functions that will be used by the graphics +engine to obtain information about the device and initiate actions +(e.g. a new page, plotting a line or some text). Type pDevDesc +is a pointer to this type. +

+

The following callbacks can be omitted (or set to the null pointer, their default value), in which case the graphics engine provides appropriate default behaviour: activate, cap, deactivate, locator, holdflush (API version 9), mode, newFrameConfirm, path, raster and size.

+

The relationship of device units to physical dimensions is set by the +element ipr of the DevDesc structure: a ‘double’ +array of length 2. +
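For instance (a minimal sketch, assuming a device whose natural unit is the big point, 1/72 inch, and with dev the pDevDesc being initialized), a driver would set:

dev->ipr[0] = 1.0 / 72.0;   /* inches per device unit, horizontal */
dev->ipr[1] = 1.0 / 72.0;   /* inches per device unit, vertical */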

+ +

The GEDevDesc type is a structure defined in +R_ext/GraphicsEngine.h (with comments in the file) as +

+
+
typedef struct _GEDevDesc GEDevDesc;
+struct _GEDevDesc {
+    pDevDesc dev;
+    Rboolean displayListOn;
+    SEXP displayList;
+    SEXP DLlastElt;
+    SEXP savedSnapshot;
+    Rboolean dirty;
+    Rboolean recordGraphics;
+    GESystemDesc *gesd[MAX_GRAPHICS_SYSTEMS];
+    Rboolean ask;
+};
+
+ +

So this is essentially a device structure plus information about the +device maintained by the graphics engine and normally17 visible to the engine +and not to the device. Type pGEDevDesc is a pointer to this +type. +

+

The graphics engine maintains an array of devices, as pointers to GEDevDesc structures. The array is of size 64 but the first element is always occupied by the "null device" and the final element is kept as NULL as a sentinel.18 This array is reflected in the R variable ‘.Devices’. Once a device is killed its element becomes available for reallocation (and its name will appear as "" in ‘.Devices’). Exactly one of the devices is ‘active’: this is the null device if no other device has been opened and not killed.

+

Each instance of a graphics device needs to set up a GEDevDesc +structure by code very similar to +

+
+
    pGEDevDesc gdd;
+
+    R_GE_checkVersionOrDie(R_GE_version);
+    R_CheckDeviceAvailable();
+    BEGIN_SUSPEND_INTERRUPTS {
+        pDevDesc dev;
+        /* Allocate and initialize the device driver data */
+        if (!(dev = (pDevDesc) calloc(1, sizeof(DevDesc))))
+            return 0; /* or error() */
+        /* set up device driver or free ‘dev’ and error() */
+        gdd = GEcreateDevDesc(dev);
+        GEaddDevice2(gdd, "dev_name");
+    } END_SUSPEND_INTERRUPTS;
+
+ +

The DevDesc structure contains a void * pointer +‘deviceSpecific’ which is used to store data specific to the +device. Setting up the device driver includes initializing all the +non-zero elements of the DevDesc structure. +

+

Note that the device structure is zeroed when allocated: this provides +some protection against future expansion of the structure since the +graphics engine can add elements that need to be non-NULL/non-zero to be +‘on’ (and the structure ends with 64 reserved bytes which will be zeroed +and allow for future expansion). +

+

Rather more protection is provided by the version number of the +engine/device API, R_GE_version defined in +R_ext/GraphicsEngine.h together with access functions +

+
+
int R_GE_getVersion(void);
+void R_GE_checkVersionOrDie(int version);
+
+ +

If a graphics device calls R_GE_checkVersionOrDie(R_GE_version) +it can ensure it will only be used in versions of R which provide the +API it was designed for and compiled against. +

+
+ + + +

6.1.2 Device capabilities

+ +

The following ‘capabilities’ can be defined for the device’s +DevDesc structure. +

+
    +
  • canChangeGamma – +Rboolean: can the display gamma be adjusted? This is now +ignored, as gamma support has been removed. +
  • canHadj – +integer: can the device do horizontal adjustment of text +via the text callback, and if so, how precisely? 0 = no +adjustment, 1 = {0, 0.5, 1} (left, centre, right justification) or 2 = +continuously variable (in [0,1]) between left and right justification. +
  • canGenMouseDown – +Rboolean: can the device handle mouse down events? This +flag and the next three are not currently used by R, but are maintained +for back compatibility. +
  • canGenMouseMove – +Rboolean: ditto for mouse move events. +
  • canGenMouseUp – +Rboolean: ditto for mouse up events. +
  • canGenKeybd – +Rboolean: ditto for keyboard events. +
  • hasTextUTF8 – +Rboolean: should non-symbol text be sent (in UTF-8) to the +textUTF8 and strWidthUTF8 callbacks, and sent as Unicode +points (negative values) to the metricInfo callback? +
  • wantSymbolUTF8 – +Rboolean: should symbol text be handled in UTF-8 in the same way +as other text? Requires textUTF8 = TRUE. +
  • haveTransparency: +does the device support semi-transparent colours? +
  • haveTransparentBg: +can the background be fully or semi-transparent? +
  • haveRaster: +is there support for rendering raster images? +
  • haveCapture: +is there support for grid::grid.cap? +
  • haveLocator: +is there an interactive locator? +
+ +

The last three can often be deduced to be false from the presence of +NULL entries instead of the corresponding functions. +
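As a sketch only (the particular settings are illustrative and not a recommendation; dev is again the pDevDesc being initialized), a simple screen device might declare its capabilities along these lines:

dev->canClip = TRUE;           /* the device clips text and drawing itself */
dev->canChangeGamma = FALSE;   /* gamma support has been removed */
dev->canHadj = 2;              /* continuously variable horizontal justification */
dev->hasTextUTF8 = TRUE;       /* textUTF8/strWidthUTF8 callbacks are provided */
dev->wantSymbolUTF8 = TRUE;    /* symbol font also handled as UTF-8 */
dev->canGenMouseDown = FALSE;  /* event flags kept only for back-compatibility */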

+
+ + + +

6.1.3 Handling text

+ +

Handling text is probably the hardest task for a graphics device, and +the design allows for the device to optionally indicate that it has +additional capabilities. (If the device does not, these will if +possible be handled in the graphics engine.) +

+

The three callbacks for handling text that must be in all graphics +devices are text, strWidth and metricInfo with +declarations +

+
+
void text(double x, double y, const char *str, double rot, double hadj,
+          pGEcontext gc, pDevDesc dd);
+
+double strWidth(const char *str, pGEcontext gc, pDevDesc dd);
+
+void metricInfo(int c, pGEcontext gc,
+               double* ascent, double* descent, double* width,
+               pDevDesc dd);
+
+ +

The ‘gc’ parameter provides the graphics context, most importantly +the current font and fontsize, and ‘dd’ is a pointer to the active +device’s structure. +

+

The text callback should plot ‘str’ at ‘(x, y)’19 with an anti-clockwise rotation of ‘rot’ degrees. (For ‘hadj’ see below.) The interpretation for horizontal text is that the baseline is at y and the start is at x, so any left bearing for the first character will start at x.

+

The strWidth callback computes the width of the string which it +would occupy if plotted horizontally in the current font. (Width here +is expected to include both (preferably) or neither of left and right +bearings.) +

+

The metricInfo callback computes the size of a single +character: ascent is the distance it extends above the baseline +and descent how far it extends below the baseline. +width is the amount by which the cursor should be advanced when +the character is placed. For ascent and descent this is +intended to be the bounding box of the ‘ink’ put down by the glyph and +not the box which might be used when assembling a line of conventional +text (it needs to be for e.g. hat(beta) to work correctly). +However, the width is used in plotmath to advance to the next +character, and so needs to include left and right bearings. +

+

The interpretation of ‘c’ depends on the locale. In a single-byte locale values 32...255 indicate the corresponding character in the locale (if present). For the symbol font (as used by ‘graphics::par(font=5)’, ‘grid::gpar(fontface=5)’ and by ‘plotmath’), values 32...126, 161...239, 241...254 indicate glyphs in the Adobe Symbol encoding. In a multibyte locale, c represents a Unicode point (except in the symbol font). So the function needs to include code like

+
+
    Rboolean Unicode = mbcslocale && (gc->fontface != 5);
+    if (c < 0) { Unicode = TRUE; c = -c; }
+    if(Unicode) UniCharMetric(c, ...); else CharMetric(c, ...);
+
+ +

In addition, if device capability hasTextUTF8 (see below) is +true, Unicode points will be passed as negative values: the code snippet +above shows how to handle this. (This applies to the symbol font only +if device capability wantSymbolUTF8 is true.) +

+

If possible, the graphics device should handle clipping of text. It +indicates this by the structure element canClip which if true +will result in calls to the callback clip to set the clipping +region. If this is not done, the engine will clip very crudely (by +omitting any text that does not appear to be wholly inside the clipping +region). +

+

The device structure has an integer element canHadj, which indicates if the device can do horizontal alignment of text. If this is one, argument ‘hadj’ to text will be called as 0, 0.5, 1 to indicate left-, centre- and right-alignment at the indicated position. If it is two, continuous values in the range [0, 1] are assumed to be supported.
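As a rough sketch (assuming canHadj = 2, a device whose y axis increases upwards, cos and sin from <math.h>, and a hypothetical MyDev_StrWidth helper), a text callback can honour ‘hadj’ by moving the start point back along the baseline by that fraction of the string width:

static void MyDev_Text(double x, double y, const char *str,
                       double rot, double hadj,
                       pGEcontext gc, pDevDesc dd)
{
    double w = MyDev_StrWidth(str, gc, dd);          /* hypothetical width helper */
    double theta = rot * 3.141592653589793 / 180.0;  /* anti-clockwise rotation */
    x -= hadj * w * cos(theta);
    y -= hadj * w * sin(theta);
    /* ... render str with its baseline starting at the adjusted (x, y) ... */
}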

+

If capability hasTextUTF8 is true, it has two consequences. First, there are callbacks textUTF8 and strWidthUTF8 that should behave identically to text and strWidth except that ‘str’ is assumed to be in UTF-8 rather than the current locale’s encoding. The graphics engine will call these for all text except in the symbol font. Second, Unicode points will be passed to the metricInfo callback as negative integers. If your device would prefer to have UTF-8-encoded symbols, define wantSymbolUTF8 as well as hasTextUTF8. In that case text in the symbol font is sent to textUTF8 and strWidthUTF8.

+

Some devices can produce high-quality rotated text, but those based on +bitmaps often cannot. Those which can should set +useRotatedTextInContour to be true from graphics API version 4. +

+

Several other elements relate to the precise placement of text by the +graphics engine: +

+
+
double xCharOffset;
+double yCharOffset;
+double yLineBias;
+double cra[2];
+
+ +

These are more than a little mysterious. Element cra provides an indication of the character size, par("cra") in base graphics, in device units. The mystery is what is meant by ‘character size’: which character, which font at which size? Some help can be obtained by looking at what this is used for. The first element, ‘width’, is not used by R except to set the graphical parameters. The second, ‘height’, is used to set the line spacing, that is the relationship between par("mar") and par("mai") and so on. It is suggested that a good choice is

+
+
dd->cra[0] = 0.9 * fnsize;
+dd->cra[1] = 1.2 * fnsize;
+
+ +

where ‘fnsize’ is the ‘size’ of the standard font (cex=1) +on the device, in device units. So for a 12-point font (the usual +default for graphics devices), ‘fnsize’ should be 12 points in +device units. +

+

The remaining elements are yet more mysterious. The postscript() +device says +

+
+
    /* Character Addressing Offsets */
+    /* These offsets should center a single */
+    /* plotting character over the plotting point. */
+    /* Pure guesswork and eyeballing ... */
+
+    dd->xCharOffset =  0.4900;
+    dd->yCharOffset =  0.3333;
+    dd->yLineBias = 0.2;
+
+ +

It seems that xCharOffset is not currently used, and +yCharOffset is used by the base graphics system to set vertical +alignment in text() when pos is specified, and in +identify(). It is occasionally used by the graphic engine when +attempting exact centring of text, such as character string values of +pch in points() or grid.points()—however, it is +only used when precise character metric information is not available or +for multi-line strings. +

+

yLineBias is used in the base graphics system in axis() and +mtext() to provide a default for their ‘padj’ argument. +

+
+ +
+

+Next: , Previous: , Up: Graphics devices   [Contents][Index]

+
+ +

6.1.4 Conventions

+ +

The aim is to make the (default) output from graphics devices as similar +as possible. Generally people follow the model of the postscript +and pdf devices (which share most of their internal code). +

+

The following conventions have become established: +

+
    +
  • The default size of a device should be 7 inches square. + +
  • There should be a ‘pointsize’ argument which defaults to 12, and it +should give the pointsize in big points (1/72 inch). How exactly this +is interpreted is font-specific, but it should use a font which works +with lines packed 1/6 inch apart, and looks good with lines 1/5 inch +apart (that is with 2pt leading). + +
  • The default font family should be a sans serif font, e.g Helvetica or +similar (e.g. Arial on Windows). + +
  • lwd = 1 should correspond to a line width of 1/96 inch. This +will be a problem with pixel-based devices, and generally there is a +minimum line width of 1 pixel (although this may not be appropriate +where anti-aliasing of lines is used, and cairo prefers a minimum +of 2 pixels). + +
  • Even very small circles should be visible, e.g. by using a minimum +radius of 1 pixel or replacing very small circles by a single filled +pixel. + +
  • How RGB colour values will be interpreted should be documented, and +preferably be sRGB. + +
  • The help page should describe its policy on these conventions. + +
+ +

These conventions are less clear-cut for bitmap devices, especially +where the bitmap format does not have a design resolution. +

+

The interpretation of the line texture (par("lty")) is described in the header GraphicsEngine.h and in the help for par: note that the ‘scale’ of the pattern should be proportional to the line width (at least for widths above the default).

+ +
+ +
+

+Next: , Previous: , Up: Graphics devices   [Contents][Index]

+
+ +

6.1.5 ‘Mode’

+ +

One of the device callbacks is a function mode, documented in +the header as +

+
+
     * device_Mode is called whenever the graphics engine
+     * starts drawing (mode=1) or stops drawing (mode=0)
+     * GMode (in graphics.c) also says that
+     * mode = 2 (graphical input on) exists.
+     * The device is not required to do anything
+
+ +

mode = 2 has only recently been documented at device level. It could be used to change the graphics cursor, but devices currently do that in the locator callback. (In base graphics the mode is set for the duration of a locator call, but if type != "n" is switched back for each point whilst annotation is being done.)

+

Many devices do indeed do nothing on this call, but some screen devices +ensure that drawing is flushed to the screen when called with mode += 0. It is tempting to use it for some sort of buffering, but note +that ‘drawing’ is interpreted at quite a low level and a typical single +figure will stop and start drawing many times. The buffering introduced +in the X11() device makes use of mode = 0 to indicate +activity: it updates the screen after ca 100ms of inactivity. +

+

This callback need not be supplied if it does nothing. +
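A minimal sketch (MyDevData and MyDev_Flush are hypothetical names, not part of any existing device) of a screen device that flushes its back buffer when drawing stops:

static void MyDev_Mode(int mode, pDevDesc dd)
{
    MyDevData *xd = (MyDevData *) dd->deviceSpecific;
    if (mode == 0)
        MyDev_Flush(xd);   /* copy the back buffer to the screen */
    /* mode == 1 (drawing starts) and mode == 2 (graphical input) need no action here */
}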

+
+ +
+

+Next: , Previous: , Up: Graphics devices   [Contents][Index]

+
+ +

6.1.6 Graphics events

+ +

Graphics devices may be designed to handle user interaction: not all are. +

+

Users may use grDevices::setGraphicsEventEnv to set the +eventEnv environment in the device driver to hold event +handlers. When the user calls grDevices::getGraphicsEvent, R will +take three steps. First, it sets the device driver member +gettingEvent to true for each device with a +non-NULL eventEnv entry, and calls initEvent(dd, +true) if the callback is defined. It then enters an event loop. Each +time through the loop R will process events once, then check whether any +device has set the result member of eventEnv to a +non-NULL value, and will save the first such value found to be +returned. C functions doMouseEvent and doKeybd are +provided to call the R event handlers onMouseDown, +onMouseMove, onMouseUp, and onKeybd and set +eventEnv$result during this step. Finally, initEvent is +called again with init=false to inform the devices that the +loop is done, and the result is returned to the user. +

+
+ +
+

+Previous: , Up: Graphics devices   [Contents][Index]

+
+ +

6.1.7 Specific devices

+ +

Specific devices are mostly documented by comments in their sources, +although for devices of many years’ standing those comments can be in +need of updating. This subsection is a repository of notes on design +decisions. +

+ + + + + +
+ +
+

+Next: , Previous: , Up: Specific devices   [Contents][Index]

+
+ +

6.1.7.1 X11()

+ +

The X11(type="Xlib") device dates back to the mid 1990’s and was +written then in Xlib, the most basic X11 toolkit. It has since +optionally made use of a few features from other toolkits: libXt +is used to read X11 resources, and libXmu is used in the handling +of clipboard selections. +

+

Using basic Xlib code makes drawing fast, but is limiting. There is no support for translucent colours (that came in the Xrender toolkit of 2000) nor for rotated text (which R implements by rendering text to a bitmap and rotating the latter).

+

The hinting for the X11 window asks for backing store to be used, and +some windows managers may use it to handle repaints, but it seems that +most repainting is done by replaying the display list (and here the fast +drawing is very helpful). +

+

There are perennial problems with finding fonts. Many users fail to +realize that fonts are a function of the X server and not of the machine +that R is running on. After many difficulties, R tries first to +find the nearest size match in the sizes provided for Adobe fonts in the +standard 75dpi and 100dpi X11 font packages—even that will fail to +work when users of near-100dpi screens have only the 75dpi set +installed. The 75dpi set allows sizes down to 6 points on a 100dpi +screen, but some users do try to use smaller sizes and even 6 and 8 +point bitmapped fonts do not look good. +

+

Introduction of UTF-8 locales has caused another wave of difficulties. +X11 has very few genuine UTF-8 fonts, and produces composite fontsets +for the iso10646-1 encoding. Unfortunately these seem to have +low coverage apart from a few monospaced fonts in a few sizes (which are +not suitable for graph annotation), and where glyphs are missing what is +plotted is often quite unsatisfactory. +

+

The current approach is to make use of more modern toolkits, namely +cairo for rendering and Pango for font +management—because these are associated with Gtk+2 they are +widely available. Cairo supports translucent colours and alpha-blending +(via Xrender), and anti-aliasing for the display of lines +and text. Pango’s font management is based on fontconfig and +somewhat mysterious, but it seems mainly to use Type 1 and TrueType +fonts on the machine running R and send grayscale bitmaps to cairo. +

+ +
+ +
+

+Previous: , Up: Specific devices   [Contents][Index]

+
+ +

6.1.7.2 windows()

+ +

The windows() device is a family of devices: it supports plotting +to Windows (enhanced) metafiles, BMP, JPEG, PNG and +TIFF files as well as to Windows printers. +

+

In most of these cases the primary plotting is to a bitmap: this is used +for the (default) buffering of the screen device, which also enables the +current plot to be saved to BMP, JPEG, PNG or TIFF (it is the internal +bitmap which is copied to the file in the appropriate format). +

+

The device units are pixels (logical ones on a metafile device). +

+

The code was originally written by Guido Masarotto with extensive use of +macros, which can make it hard to disentangle. +

+

For a screen device, xd->gawin is the canvas of the screen, and xd->bm is the off-screen bitmap. So macro DRAW arranges to plot to xd->bm, and if buffering is off, also to xd->gawin. For all other devices, xd->gawin is the canvas: a bitmap for the jpeg() and png() devices, and an internal representation of a Windows metafile for the win.metafile() and win.print() devices. Since ‘plotting’ is done by Windows GDI calls to the appropriate canvas, its precise nature is hidden by the GDI system.

+

Buffering on the screen device is achieved by running a timer, which +when it fires copies the internal bitmap to the screen. This is set to +fire every 500ms (by default) and is reset to 100ms after plotting +activity. +

+

Repaint events are handled by copying the internal bitmap to the screen +canvas (and then reinitializing the timer), unless there has been a resize. +Resizes are handled by replaying the display list: this might not be +necessary if a fixed canvas with scrollbars is being used, but that is +the least popular of the three forms of resizing. +

+

Text on the device has moved to ‘Unicode’ (UCS-2) in recent years. +UTF-8 is requested (hasTextUTF8 = TRUE) for standard text, and +converted to UCS-2 in the plotting functions in file +src/extra/graphapp/gdraw.c. However, GDI has no support for +Unicode symbol fonts, and symbols are handled in Adobe Symbol encoding. +

+

Support for translucent colours (with an alpha channel between 0 and 255) was introduced on the screen device and bitmap devices.20 This is done by drawing on a further internal bitmap, xd->bm2, in the opaque version of the colour then alpha-blending that bitmap to xd->bm. The alpha-blending routine is in a separate DLL, msimg32.dll, which is loaded on first use. As small a rectangular region as reasonably possible is alpha-blended (this is rectangle r in the code), but things like mitre joins make estimation of a tight bounding box too much work for lines and polygonal boundaries. Translucent-coloured lines are not common, and the performance seems acceptable.

+

The support for a transparent background in png() predates full alpha-channel support in libpng (let alone in PNG viewers), so makes use of the limited transparency support in earlier versions of PNG. Where 24-bit colour is used, this is done by marking a single colour to be rendered as transparent. R chose ‘#fdfefd’, and uses this as the background colour (in GA_NewPage) if the specified background colour is transparent (all non-opaque background colours are treated as transparent). So this works by marking that colour in the PNG file, and viewers without transparency support see a slightly-off-white background, as if there were a near-white canvas. Where a palette is used in the PNG file (if less than 256 colours were used) then this colour is recorded with full transparency and the remaining colours as opaque. If 32-bit colour were available then we could add a full alpha channel, but this is dependent on the graphics hardware and undocumented properties of GDI.

+ +
+ + + +

6.2 Colours

+ +

Devices receive colours as a typedef rcolor (an unsigned int) defined in the header R_ext/GraphicsEngine.h. The 4 bytes are R, G, B and alpha from least to most significant. So each of R, G and B has 256 levels of luminosity from 0 to 255. The alpha byte represents opacity, so value 255 is fully opaque and 0 fully transparent: many but not all devices handle semi-transparent colours.

+

Colors can be created in C via the macro R_RGBA, and a set of +macros are defined in R_ext/GraphicsDevice.h to extract the +various components. +
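For example (a small sketch; R_RGBA is named above, while R_RED and R_ALPHA are assumed here to be among the extraction macros referred to):

#include <R_ext/GraphicsEngine.h>   /* defines rcolor and includes GraphicsDevice.h */

rcolor semired     = R_RGBA(255, 0, 0, 128);   /* half-transparent red */
unsigned int red   = R_RED(semired);           /* 255 */
unsigned int alpha = R_ALPHA(semired);         /* 128 */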

+

Colours in the base graphics system were originally adopted from S (and +before that the GRZ library from Bell Labs), with the concept of a +(variable-sized) palette of colours referenced by numbers +‘1...N’ plus ‘0’ (the background colour of the current +device). R introduced the idea of referring to colours by character +strings, either in the forms ‘#RRGGBB’ or ‘#RRGGBBAA’ +(representing the bytes in hex) as given by function rgb() or via +names: the 657 known names are given in the character vector +colors and in a table in file colors.c in package +grDevices. Note that semi-transparent colours are not +‘premultiplied’, so 50% transparent white is ‘#ffffff80’. +

+

Integer or character NA colours are mapped internally to +transparent white, as is the character string "NA". +

+

The handling of negative colour numbers was undefined (and inconsistent) +prior to R 3.0.0, which made them an error. Colours greater than +‘N’ are wrapped around, so that for example with the default +palette of size 8, colour ‘10’ is colour ‘2’ in the palette. +

+

Integer colours have been used more widely than the base graphics +sub-system, as they are supported by package grid and hence by +lattice and ggplot2. (They are also used by package +rgl.) grid did re-define colour ‘0’ to be +transparent white, but rgl used col2rgb and hence the +background colour of base graphics. +

+

Note that positive integer colours refer to the current palette and +colour ‘0’ to the current device (and a device is opened if needs +be). These are mapped to type rcolor at the time of use: this +matters when re-playing the display list, e.g. when a device is +resized or dev.copy is used. The palette should be thought of as +per-session: it is stored in package grDevices. +

+

The convention is that devices use the colorspace ‘sRGB’. This is an +industry standard: it is used by Web browsers and JPEGs from all but +high-end digital cameras. The interpretation is a matter for graphics +devices and for code that manipulates colours, but not for the graphics +engine or subsystems. +

+

R uses a painting model similar to PostScript and PDF. This means +that where shapes (circles, rectangles and polygons) can both be filled +and have a stroked border, the fill should be painted first and then the +border (or otherwise only half the border will be visible). Where both +the fill and the border are semi-transparent there is some room for +interpretation of the intention. Most devices first paint the fill and +then the border, alpha-blending at each step. However, PDF does some +automatic grouping of objects, and when the fill and the border +have the same alpha, they are painted onto the same layer and then +alpha-blended in one step. (See p. 569 of the PDF Reference Sixth +Edition, version 1.7. Unfortunately, although this is what the PDF +standard says should happen, it is not correctly implemented by some +viewers.) +

+

The mapping from colour numbers to type rcolor is primarily done +by function RGBpar3: this is exported from the R binary but +linked to code in package grDevices. The first argument is a +SEXP pointing to a character, integer or double vector, and the +second is the rcolor value for colour 0 (or "0"). +C entry point RGBpar is a wrapper that takes 0 to be +transparent white: it is often used to set colour defaults for devices. +The R-level wrapper is col2rgb. +

+

There is also R_GE_str2col which takes a C string and converts it to type rcolor: "0" is converted to transparent white.
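A brief sketch of its use (the exact prototype should be checked in R_ext/GraphicsEngine.h):

rcolor fill = R_GE_str2col("#ffffff80");   /* 50% transparent white */
rcolor bg   = R_GE_str2col("0");           /* converted to transparent white */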

+

There is an R-level conversion of colours to ‘#RRGGBBAA’ by image.default(useRaster = TRUE).

+

The other color-conversion entry point in the API is name2col +which takes a colour name (a C string) and returns a value of type +rcolor. This handles "NA", "transparent" and the +657 colours known to the R function colors(). +

+
+ +
+

+Next: , Previous: , Up: Graphics Devices   [Contents][Index]

+
+ +

6.3 Base graphics

+ +

The base graphics system was migrated to package graphics in R +3.0.0: it was previously implemented in files in src/main. +

+

For historical reasons it is largely implemented in two layers. Files plot.c, plot3d.c and par.c contain the code for around 30 .External calls that implement the basic graphics operations. This code then calls functions with names starting with G, declared in header Rgraphics.h and defined in file graphics.c, which in turn call the graphics engine (whose functions almost all have names starting with GE).

+

A large part of the infrastructure of the base graphics subsystem is the set of graphics parameters (as set/read by par()). These are stored in a GPar structure declared in the private header Graphics.h. This structure has two variables (state and valid) tracking the state of the base subsystem on the device, and many variables recording the graphics parameters and functions of them.

+

The base system state is contained in baseSystemState structure +defined in R_ext/GraphicsBase.h. This contains three GPar +structures and a Boolean variable used to record if plot.new() +(or persp) has been used successfully on the device. +

+

The three copies of the GPar structure are used to store the current parameters (accessed via gpptr), the ‘device copy’ (accessed via dpptr) and space for a saved copy of the ‘device copy’ parameters. The current parameters are, clearly, those currently in use and are copied from the ‘device copy’ whenever plot.new() is called (whether or not that advances to the next ‘page’). The saved copy keeps the state when the device was last completely cleared (e.g. when plot.new() was called with par(new=FALSE)), and is used to replay the display list.

+

The separation is not completely clean: the ‘device copy’ is altered if +a plot with log scale(s) is set up via plot.window(). +

+

There is yet another copy of most of the graphics parameters in +static variables in graphics.c which are used to preserve +the current parameters across the processing of inline parameters in +high-level graphics calls (handled by ProcessInlinePars). +

+

Snapshots of the base subsystem record the ‘saved device copy’ of the +GPar structure. +

+ + + + +
+ +
+

+Previous: , Up: Base graphics   [Contents][Index]

+
+ +

6.3.1 Arguments and parameters

+ +

There is an unfortunate confusion between some of the graphical +parameters (as set by par) and arguments to base graphic +functions of the same name. This description may help set the record +straight. +

+

Most of the high-level plotting functions accept graphical parameters as +additional arguments, which are then often passed to lower-level +functions if not already named arguments (which is the main source of +confusion). +

+

Graphical parameter bg is the background colour of the plot. +Argument bg refers to the fill colour for the filled symbols +21 to 25. It is an argument to the function +plot.xy, but normally passed by the default method of +points, often from a plot method. +

+

Graphics parameters cex, col, lty, lwd and pch also appear as arguments of plot.xy and so are often passed as arguments from higher-level plot functions such as lines, points and plot methods. They appear as arguments of legend; col, lty and lwd are arguments of arrows and segments. When used as arguments they can be vectors, recycled to control the various lines, points and segments. When set as graphical parameters they set the default rendering: in addition par(cex=) sets the overall character expansion which subsequent calls (as arguments or in-line graphical parameters) multiply.

+

The handling of missing values differs in the two classes of uses. +Generally these are errors when used in par but cause the +corresponding element of the plot to be omitted when used as an element +of a vector argument. Originally the interpretation of arguments was +mainly left to the device, but as from R 3.0.0 some of this is +pre-empted in the graphics engine (but for example the handling of +lwd = 0 remains device-specific, with some interpreting it as a +‘thinnest possible’ line). +

+
+ +
+

+Previous: , Up: Graphics Devices   [Contents][Index]

+
+ +

6.4 Grid graphics

+ +

[At least pointers to documentation.] +

+
+ +
+

+Next: , Previous: , Up: Top   [Contents][Index]

+
+ +

7 GUI consoles

+ +

The standard R front-ends are programs which run in a terminal, but +there are several ways to provide a GUI console. +

+

This can be done by a package which is loaded from terminal-based R +and launches a console as part of its startup code or by the user +running a specific function: package Rcmdr is a well-known +example with a Tk-based GUI. +

+

There used to be a Gtk-based console invoked by R --gui=GNOME: +this relied on special-casing in the front-end shell script to launch a +different executable. There still is R --gui=Tk, which starts +terminal-based R and runs tcltk::tkStartGui() as part of the +modified startup sequence. +

+

However, the main way to run a GUI console is to launch a separate +program which runs embedded R: this is done by Rgui.exe on +Windows and R.app on OS X. The first is an integral part +of R and the code for the console is currently in R.dll. +

+ + + + +
+ +
+

+Previous: , Up: GUI consoles   [Contents][Index]

+
+ +

7.1 R.app

+ +

R.app is an OS X application which provides a console. Its sources are a separate project21, and its binaries link to an R installation which it runs as a dynamic library libR.dylib. The standard CRAN distribution of R for OS X bundles the GUI and R itself, but installing the GUI is optional and either component can be updated separately.

+

R.app relies on libR.dylib being in a specific place, +and hence on R having been built and installed as a Mac OS X +‘framework’. Specifically, it uses +/Library/Frameworks/R.framework/R. This is a symbolic link, as +frameworks can contain multiple versions of R. It eventually +resolves to +/Library/Frameworks/R.framework/Versions/Current/Resources/lib/libR.dylib, +which is (in the CRAN distribution) a ‘fat’ binary containing +multiple sub-architectures. +

+

OS X applications are directory trees: each R.app contains +a front-end written in Objective-C for one sub-architecture: in the +standard distribution there are separate applications for 32- and 64-bit +Intel architectures. +

+

Originally the R sources contained quite a lot of code used only by the OS X GUI, but by R 3.0.0 this had been migrated to the R.app sources.

+

R.app starts R as an embedded application with a +command-line which includes --gui=aqua (see below). It uses +most of the interface pointers defined in the header +Rinterface.h, plus a private interface pointer in file +src/main/sysutils.c. It adds an environment +it names tools:RGUI to the second position in the search path. +This contains a number of utility functions used to support the menu +items, for example package.manager(), plus functions q() +and quit() which mask those in package base—the custom +versions save the history in a way specific to R.app. +

+

There is a configure option --with-aqua for R +which customizes the way R is built: this is distinct from the +--enable-R-framework option which causes make install +to install R as the framework needed for use with R.app. (The +option --with-aqua is the default on OS X.) It sets the +macro HAVE_AQUA in config.h and the make variable +BUILD_AQUA_TRUE. These have several consequences: +

+
    +
  • The quartz() device is built (other than as a stub) in package +grDevices: this needs an Objective-C compiler. Then +quartz() can be used with terminal R provided the latter has +access to the OS X screen. + +
  • File src/unix/aqua.c is compiled. This now only contains an +interface pointer for the quartz() device(s). + +
  • capabilities("aqua") is set to TRUE. + +
  • The default path for a personal library directory is set as +~/Library/R/x.y/library. + +
  • There is support for setting a ‘busy’ indicator whilst waiting for +system() to return. + +
  • R_ProcessEvents is inhibited in a forked child from package +parallel. The associated callback in R.app does things +which should not be done in a child, and forking forks the whole process +including the console. + +
  • There is support for starting the embedded R with the option +--gui=aqua: when this is done the global C variable +useaqua is set to a true value. This has consequences: + +
      +
    • The R session is asserted to be interactive via R_Interactive. + +
    • .Platform$GUI is set to "AQUA". That has consequences: +
        +
      • The environment variable DISPLAY is set to ‘:0’ if not +already set. + +
      • /usr/local/bin is appended to PATH since that is where +gfortran is installed. + +
      • The default HTML browser is switched to the one in R.app. + +
      • Various widgets are switched to the versions provided in +R.app: these include graphical menus, the data editor (but not +the data viewer used by View()) and the workspace browser invoked +by browseEnv(). + +
      • The grDevices package when loaded knows that it is being run +under R.app and so informs any quartz devices that a +Quartz event loop is already running. +
      + +
    • The use of the OS’s system function (including by system() +and system2(), and to launch editors and pagers) is replaced by a +version in R.app (which by default just calls the OS’s +system with various signal handlers reset). + +
    + +
  • If either R was started by --gui=aqua or R is running in +a terminal which is not of type ‘dumb’, the standard output to +files stdout and stderr is directed through the C function +Rstd_WriteConsoleEx. This uses ANSI terminal escapes to render +lines sent to stderr as bold on stdout. + +
  • For historical reasons the startup option -psn is allowed but +ignored. (It seems that in 2003, ‘r27492’, this was added by Finder.) + +
+ + + +
+ +
+

+Next: , Previous: , Up: Top   [Contents][Index]

+
+ +

8 Tools

+ +

The behavior of R CMD check can be controlled through a +variety of command line arguments and environment variables. +

+

There is an internal --install=value command line +argument not shown by R CMD check --help, with possible values +

+
+
check:file
+

Assume that installation was already performed with stdout/stderr to +file, the contents of which need to be checked (without repeating +the installation). This is useful for checks applied by repository +maintainers: it reduces the check time by the installation time given +that the package has already been installed. In this case, one also +needs to specify where the package was installed to using command +line option --library. +

+
fake
+

Fake installation, and turn off the run-time tests. +

+
skip
+

Skip installation, e.g., when testing recommended packages bundled with +R. +

+
no
+

The same as --no-install: turns off installation and the tests which require the package to be installed.

+
+ +
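For example (the package name, log file and library path are purely illustrative), a repository maintainer who has already installed the package with the installation output captured to a log might run:

R CMD INSTALL --library=/path/to/lib pkg_1.0.tar.gz > pkg-install.log 2>&1
R CMD check --install=check:pkg-install.log --library=/path/to/lib pkg_1.0.tar.gz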

The following environment variables can be used to customize the +operation of check: a convenient place to set these is the +check environment file (default, ~/.R/check.Renviron). +
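A minimal sketch of such a file (the variable names are the ones documented below; the particular values are illustrative only):

## ~/.R/check.Renviron -- personal defaults for R CMD check
_R_CHECK_FORCE_SUGGESTS_=false
_R_CHECK_RD_LINE_WIDTHS_=true
_R_CHECK_TIMINGS_=10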

+
+
_R_CHECK_ALL_NON_ISO_C_ + +
+

If true, do not ignore compiler (typically GCC) warnings about non ISO C +code in system headers. Note that this may also show additional +ISO C++ warnings. +Default: false. +

+
_R_CHECK_FORCE_SUGGESTS_ + +
+

If true, give an error if suggested packages are not available. +Default: true (but false for CRAN submission checks). +

+
_R_CHECK_RD_CONTENTS_ + +
+

If true, check Rd files for auto-generated content which needs +editing, and missing argument documentation. +Default: true. +

+
_R_CHECK_RD_LINE_WIDTHS_ + +
+

If true, check Rd line widths in usage and examples sections. +Default: false (but true for CRAN submission checks). +

+
_R_CHECK_RD_STYLE_ + +
+

If true, check whether Rd usage entries for S3 methods use the full +function name rather than the appropriate \method markup. +Default: true. +

+
_R_CHECK_RD_XREFS_ + +
+

If true, check the cross-references in .Rd files. +Default: true. +

+
_R_CHECK_SUBDIRS_NOCASE_ + +
+

If true, check the case of directories such as R and man. +Default: true. +

+
_R_CHECK_SUBDIRS_STRICT_ + +
+

Initial setting for --check-subdirs. +Default: ‘default’ (which checks only tarballs, and checks in the +src only if there is no configure file). +

+
_R_CHECK_USE_CODETOOLS_ + +
+

If true, make use of the codetools package, which provides a +detailed analysis of visibility of objects (but may give false +positives). +Default: true (if recommended packages are installed). +

+
_R_CHECK_USE_INSTALL_LOG_ + +
+

If true, record the output from installing a package as part of its +check to a log file (00install.out by default), even when running +interactively. +Default: true. +

+
_R_CHECK_VIGNETTES_NLINES_ + +
+

Maximum number of lines to show at the bottom of the output when reporting +errors in running or re-building vignettes. +Default: 10 for running, 25 for re-building. +

+
_R_CHECK_CODOC_S4_METHODS_ + +
+

Control whether codoc() testing is also performed on S4 methods. +Default: true. +

+
_R_CHECK_DOT_INTERNAL_ + +
+

Control whether the package code is scanned for .Internal calls, +which should only be used by base (and occasionally by recommended) packages. +Default: true. +

+
_R_CHECK_EXECUTABLES_ + +
+

Control checking for executable (binary) files. +Default: true. +

+
_R_CHECK_EXECUTABLES_EXCLUSIONS_ + +
+

Control whether checking for executable (binary) files ignores files +listed in the package’s BinaryFiles file. +Default: true (but false for CRAN submission checks). +However, most likely this package-level override mechanism will be +removed eventually. +

+
_R_CHECK_PERMISSIONS_ + +
+

Control whether permissions of files should be checked. +Default: true iff .Platform$OS.type == "unix". +

+
_R_CHECK_FF_CALLS_ + +
+

Allows turning off checkFF() testing. If set to +‘registration’, checks the registration information (number of +arguments, correct choice of .C/.Fortran/.Call/.External) for +such calls provided the package is installed. +Default: true. +

+
_R_CHECK_FF_DUP_ + +
+

Controls checkFF(check_DUP) +Default: true (and forced to be true for CRAN submission checks). +

+
_R_CHECK_LICENSE_ + +
+

Control whether/how license checks are performed. A possible value is +‘maybe’ (warn in case of problems, but not about standardizable +non-standard license specs). +Default: true. +

+
_R_CHECK_RD_EXAMPLES_T_AND_F_ + +
+

Control whether check_T_and_F() also looks for “bad” (global) +‘T’/‘F’ uses in examples. +Off by default because this can result in false positives. +

+
_R_CHECK_RD_CHECKRD_MINLEVEL_ + +
+

Controls the minimum level for reporting warnings from checkRd. +Default: -1. +

+
_R_CHECK_XREFS_REPOSITORIES_ + +
+

If set to a non-empty value, a space-separated list of repositories to use to determine known packages. Default: empty, in which case the CRAN, Omegahat and Bioconductor repositories known to R are used.

+
_R_CHECK_SRC_MINUS_W_IMPLICIT_ + +
+

Control whether installation output is checked for compilation warnings +about implicit function declarations (as spotted by GCC with command +line option -Wimplicit-function-declaration, which is implied +by -Wall). +Default: false. +

+
_R_CHECK_SRC_MINUS_W_UNUSED_ + +
+

Control whether installation output is checked for compilation warnings +about unused code constituents (as spotted by GCC with command line +option -Wunused, which is implied by -Wall). +Default: true. +

+
_R_CHECK_WALL_FORTRAN_ + +
+

Control whether gfortran 4.0 or later -Wall warnings are used in +the analysis of installation output. +Default: false, even though the warnings are justifiable. +

+
_R_CHECK_ASCII_CODE_ + +
+

If true, check R code for non-ascii characters. +Default: true. +

+
_R_CHECK_ASCII_DATA_ + +
+

If true, check data for non-ascii characters. +Default: true. +

+
_R_CHECK_COMPACT_DATA_ + +
+

If true, check data for ascii and uncompressed saves, and also check if +using bzip2 or xz compression would be significantly +better. +Default: true. +

+
_R_CHECK_SKIP_ARCH_ + +
+

Comma-separated list of architectures that will be omitted from +checking in a multi-arch setup. +Default: none. +

+
_R_CHECK_SKIP_TESTS_ARCH_ + +
+

Comma-separated list of architectures that will be omitted from +running tests in a multi-arch setup. +Default: none. +

+
_R_CHECK_SKIP_EXAMPLES_ARCH_ + +
+

Comma-separated list of architectures that will be omitted from +running examples in a multi-arch setup. +Default: none. +

+
_R_CHECK_VC_DIRS_ + +
+

Should the unpacked package directory be checked for version-control +directories (CVS, .svn …)? +Default: true for tarballs. +

+
_R_CHECK_PKG_SIZES_ + +
+

Should du be used to find the installed sizes of packages?
R CMD check does check for the availability of du,
but this option allows the check to be overruled if an unsuitable
command is found (including one that does not respect the -k
flag to report in units of 1Kb, or reports in a different format – the
GNU, OS X and Solaris du commands have been tested).
Default: true if du is found.

+
_R_CHECK_DOC_SIZES_ + +
+

Should qpdf be used to check the installed sizes of PDFs? +Default: true if qpdf is found. +

+
_R_CHECK_DOC_SIZES2_ + +
+

Should gs be used to check the installed sizes of PDFs? This +is slower than (and in addition to) the previous check, but does detect +figures with excessive detail (often hidden by over-plotting) or bitmap +figures with too high a resolution. Requires that R_GSCMD is set +to a valid program, or gs (or on Windows, +gswin32.exe or gswin64c.exe) is on the path. +Default: false (but true for CRAN submission checks). +

+
_R_CHECK_ALWAYS_LOG_VIGNETTE_OUTPUT_ + +
+

By default the output from running the R code in the vignettes is +kept only if there is an error. +Default: false. +

+
_R_CHECK_CLEAN_VIGN_TEST_ + +
+

Should the vign_test directory be removed if the test is successful? +Default: true. +

+
_R_CHECK_REPLACING_IMPORTS_ + +
+

Should warnings about replacing imports be reported? These sometimes come +from auto-generated NAMESPACE files in other packages, but most +often from importing the whole of a namespace rather than using +importFrom. +Default: false (but true for CRAN submission checks). +

+
_R_CHECK_UNSAFE_CALLS_ + +
+

Check for calls that appear to tamper with (or allow tampering with) +already loaded code not from the current package: such calls may well +contravene CRAN policies. +Default: true. +

+
_R_CHECK_TIMINGS_ + +
+

Optionally report timings for installation, examples, tests and +running/re-building vignettes as part of the check log. The format is +‘[as/bs]’ for the total CPU time (including child processes) +‘a’ and elapsed time ‘b’, except on Windows, when it is +‘[bs]’. In most cases timings are only given for ‘OK’ checks. +Times with an elapsed component over 10 mins are reported in minutes +(with abbreviation ‘m’). The value is the smallest numerical value +in elapsed seconds that should be reported: non-numerical values +indicate that no report is required, a value of ‘0’ that a report +is always required. +Default: "". (10 for CRAN checks.) +

+
+
_R_CHECK_INSTALL_DEPENDS_ + +
+

If set to a true value and a test installation is to be done, this is +done with .libPaths() containing just a temporary library +directory and .Library. The temporary library is populated by +symbolic links22 +to the installed copies of all the Depends/Imports/LinkingTo packages +which are not in .Library. Default: false (but true for CRAN +submission checks). +

+

Note that this is actually implemented in R CMD INSTALL, so it +is available to those who first install recording to a log, then call +R CMD check. +

+
+
_R_CHECK_DEPENDS_ONLY_ + +
+
_R_CHECK_SUGGESTS_ONLY_ + +
+

If set to a true value, running examples, tests and vignettes is done +with .libPaths() containing just a temporary library directory +and .Library. The temporary library is populated by symbolic +links23 to the installed copies of +all the Depends/Imports and (for the second only) Suggests packages +which are not in .Library. (As an exception, packages in a +‘VignetteBuilder’ field are always made available.) +Default: false (but _R_CHECK_SUGGESTS_ONLY_ is true for CRAN checks). +

+
+
_R_CHECK_NO_RECOMMENDED_ + +
+

If set to a true value, augment the previous checks to make recommended +packages unavailable unless declared. +Default: false (but true for CRAN submission checks). +

+

This may give false positives on code which uses +grDevices::densCols and stats:::asSparse as these invoke +KernSmooth and Matrix respectively. +

+
+
_R_CHECK_CODETOOLS_PROFILE_ + +
+

A string with comma-separated name=value pairs (with +value a logical constant) giving additional arguments for the +codetools functions used for analyzing package code. E.g., +use _R_CHECK_CODETOOLS_PROFILE_="suppressLocalUnused=FALSE" to +turn off suppressing warnings about unused local variables. Default: no +additional arguments, corresponding to using skipWith = TRUE, +suppressPartialMatchArgs = FALSE and suppressLocalUnused = +TRUE. +

+
+
_R_CHECK_CRAN_INCOMING_ + +
+

Check whether package is suitable for publication on CRAN. +Default: false, except for CRAN submission checks. +

+
+
_R_CHECK_XREFS_USE_ALIASES_FROM_CRAN_ + +
+

When checking anchored Rd xrefs, use Rd aliases from the CRAN package +web areas in addition to those in the packages installed locally. +Default: false. +

+
+
_R_SHLIB_BUILD_OBJECTS_SYMBOL_TABLES_ + +
+

Make the checks of compiled code more accurate by recording the symbol +tables for objects (.o files) at installation in a file +symbols.rds. (Only currently supported on Linux, Solaris, OS X, +Windows and FreeBSD.) +Default: true. +

+
+
_R_CHECK_CODE_ASSIGN_TO_GLOBALENV_ + +
+

Should the package code be checked for assignments to the global +environment? +Default: false (but true for CRAN submission checks). +

+
+
_R_CHECK_CODE_ATTACH_ + +
+

Should the package code be checked for calls to attach()? +Default: false (but true for CRAN submission checks). +

+
+
_R_CHECK_CODE_DATA_INTO_GLOBALENV_ + +
+

Should the package code be checked for calls to data() which load +into the global environment? +Default: false (but true for CRAN submission checks). +

+
+
_R_CHECK_DOT_FIRSTLIB_ + +
+

Should the package code be checked for the presence of the obsolete function +.First.lib()? +Default: false (but true for CRAN submission checks). +

+
+
_R_CHECK_DEPRECATED_DEFUNCT_ + +
+

Should the package code be checked for the presence of recently deprecated +or defunct functions (including completely removed functions). Also for +platform-specific graphics devices. +Default: false (but true for CRAN submission checks). +

+
+
_R_CHECK_SCREEN_DEVICE_ + +
+

If set to ‘warn’, give a warning if examples etc open a screen +device. If set to ‘stop’, give an error. +Default: empty (but ‘stop’ for CRAN submission checks). +

+
+
_R_CHECK_WINDOWS_DEVICE_ + +
+

If set to ‘stop’, give an error if a Windows-only device is used in +example etc. This is only useful on Windows: the devices do not exist +elsewhere. +Default: empty (but ‘stop’ for CRAN submission checks on Windows). +

+
+
_R_CHECK_TOPLEVEL_FILES_ + +
+

Report on top-level files in the package sources that are not described +in ‘Writing R Extensions’ nor are commonly understood (like +ChangeLog). Variations on standard names (e.g. +COPYRIGHT) are also reported. +Default: false (but true for CRAN submission checks). +

+
+
_R_CHECK_GCT_N_ + +
+

Should the --use-gct use gctorture2(n) rather than
gctorture(TRUE)? Set this to a positive integer n to enable it.
Default: 0.

+
+
_R_CHECK_LIMIT_CORES_ + +
+

If set, check the usage of too many cores in package parallel. If +set to ‘warn’ gives a warning, to ‘false’ or ‘FALSE’ the +check is skipped, and any other non-empty value gives an error when more +than 2 children are spawned. +Default: unset (but ‘TRUE’ for CRAN submission checks). +

+
+
_R_CHECK_CODE_USAGE_VIA_NAMESPACES_ + +
+

If set, check code usage (via codetools) directly on the +package namespace without loading and attaching the package and its +suggests and enhances. +Default: true (and true for CRAN submission checks). +

+
+
_R_CHECK_EXIT_ON_FIRST_ERROR_ + +
+

If set to a true value, the check will exit on the first error. +Default: false. +

+
+
_R_CHECK_S3_METHODS_NOT_REGISTERED_ + +
+

If set to a true value, report (apparent) S3 methods exported but not +registered. +Default: false (but true for CRAN submission checks). +

+
+
_R_CHECK_OVERWRITE_REGISTERED_S3_METHODS_ + +
+

If set to a true value, report already registered S3 methods in +base/recommended packages which are overwritten when this package’s +namespace is loaded. +Default: false (but true for CRAN submission checks). +

+
+ +

CRAN’s submission checks use something like +

+
+
_R_CHECK_CRAN_INCOMING_=TRUE
+_R_CHECK_VC_DIRS_=TRUE
+_R_CHECK_TIMINGS_=10
+_R_CHECK_INSTALL_DEPENDS_=TRUE
+_R_CHECK_SUGGESTS_ONLY_=TRUE
+_R_CHECK_NO_RECOMMENDED_=TRUE
+_R_CHECK_EXECUTABLES_EXCLUSIONS_=FALSE
+_R_CHECK_DOC_SIZES2_=TRUE
+_R_CHECK_CODE_ASSIGN_TO_GLOBALENV_=TRUE
+_R_CHECK_CODE_ATTACH_=TRUE
+_R_CHECK_CODE_DATA_INTO_GLOBALENV_=TRUE
+_R_CHECK_CODE_USAGE_VIA_NAMESPACES_=TRUE
+_R_CHECK_DOT_FIRSTLIB_=TRUE
+_R_CHECK_DEPRECATED_DEFUNCT_=TRUE
+_R_CHECK_REPLACING_IMPORTS_=TRUE
+_R_CHECK_SCREEN_DEVICE_=stop
+_R_CHECK_TOPLEVEL_FILES_=TRUE
+_R_CHECK_S3_METHODS_NOT_REGISTERED_=TRUE
+_R_CHECK_OVERWRITE_REGISTERED_S3_METHODS_=TRUE
+
+ +

These are turned on by R CMD check --as-cran: the incoming +checks also use +

+
_R_CHECK_FORCE_SUGGESTS_=FALSE
+
+ +

since some packages do suggest other packages not available on CRAN or +other commonly-used repositories. +
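As an informal illustration (the tarball name below is purely hypothetical), these settings are ordinary environment variables, so they can be set in the environment from which the check is launched and are then inherited by the R CMD check process:

Sys.setenv("_R_CHECK_TIMINGS_" = "10",           # report timings of 10s or more
           "_R_CHECK_FORCE_SUGGESTS_" = "FALSE") # tolerate unavailable Suggests
system("R CMD check --as-cran mypackage_1.0.tar.gz")  # hypothetical tarball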

+ +
+ +
+


+
+ +

9 R coding standards

+ + +

R is meant to run on a wide variety of platforms, including Linux and +most variants of Unix as well as Windows and OS X. +Therefore, when extending R by either adding to the R base +distribution or by providing an add-on package, one should not rely on +features specific to only a few supported platforms, if this can be +avoided. In particular, although most R developers use GNU +tools, they should not employ the GNU extensions to standard +tools. Whereas some other software packages explicitly rely on e.g. +GNU make or the GNU C++ compiler, R does not. +Nevertheless, R is a GNU project, and the spirit of the +GNU Coding Standards should be followed if possible. +

+

The following tools can “safely be assumed” for R extensions. +

+
    +
  • An ISO C99 C compiler. Note that extensions such as POSIX +1003.1 must be tested for, typically using Autoconf unless you are sure +they are supported on all mainstream R platforms (including Windows +and OS X). + +
  • A FORTRAN 77 compiler (but not Fortran 9x, although it is nowadays +widely available). + +
  • A simple make, considering the features of make in +4.2 BSD systems as a baseline. + + +

    GNU or other extensions, including pattern rules using +‘%’, the automatic variable ‘$^’, the ‘+=’ syntax to +append to the value of a variable, the (“safe”) inclusion of makefiles +with no error, conditional execution, and many more, must not be used +(see Chapter “Features” in the GNU Make Manual for +more information). On the other hand, building R in a separate +directory (not containing the sources) should work provided that +make supports the VPATH mechanism. +

    +

    Windows-specific makefiles can assume GNU make 3.79 +or later, as no other make is viable on that platform. +

    +
  • A Bourne shell and the “traditional” Unix programming tools, including +grep, sed, and awk. + +

    There are POSIX standards for these tools, but these may not +be fully supported. Baseline features could be determined from a book +such as The UNIX Programming Environment by Brian W. Kernighan & +Rob Pike. Note in particular that ‘|’ in a regexp is an extended +regexp, and is not supported by all versions of grep or +sed. The Open Group Base Specifications, Issue 7, which are +technically identical to IEEE Std 1003.1 (POSIX), 2008, +are available at +http://pubs.opengroup.org/onlinepubs/9699919799/mindex.html. +

+ +

Under Windows, most users will not have these tools installed, and you +should not require their presence for the operation of your package. +However, users who install your package from source will have them, as +they can be assumed to have followed the instructions in “the Windows +toolset” appendix of the “R Installation and Administration” manual +to obtain them. Redirection cannot be assumed to be available via +system as this does not use a standard shell (let alone a +Bourne shell). +

+

In addition, the following tools are needed for certain tasks. +

+
    +
  • Perl version 5 is only needed for a few uncommonly-used tools: make +install-info needs Perl installed if there is no command +install-info on the system, and for the maintainer-only script +tools/help2man.pl. + + +
  • Makeinfo version 4.7 or later is needed to build the Info files for the +R manuals written in the GNU Texinfo system. + +
+ +

It is also important that code is written in a way that allows others to
understand it. This is particularly helpful for fixing problems, and
includes using self-descriptive variable names, commenting the code, and
also formatting it properly. The R Core Team recommends using a
basic indentation of 4 for R and C (and most likely also Perl) code,
and 2 for documentation in Rd format. Emacs (21 or later) users can
implement this indentation style by putting the following in one of
their startup files, and using customization to set the
c-default-style to "bsd" and c-basic-offset to 4.

+
+
;;; ESS
+(add-hook 'ess-mode-hook
+          (lambda ()
+            (ess-set-style 'C++ 'quiet)
+            ;; Because
+            ;;                                 DEF GNU BSD K&R C++
+            ;; ess-indent-level                  2   2   8   5   4
+            ;; ess-continued-statement-offset    2   2   8   5   4
+            ;; ess-brace-offset                  0   0  -8  -5  -4
+            ;; ess-arg-function-offset           2   4   0   0   0
+            ;; ess-expression-offset             4   2   8   5   4
+            ;; ess-else-offset                   0   0   0   0   0
+            ;; ess-close-brace-offset            0   0   0   0   0
+            (add-hook 'local-write-file-hooks
+                      (lambda ()
+                        (ess-nuke-trailing-whitespace)))))
+(setq ess-nuke-trailing-whitespace-p 'ask)
+;; or even
+;; (setq ess-nuke-trailing-whitespace-p t)
+
;;; Perl
+(add-hook 'perl-mode-hook
+          (lambda () (setq perl-indent-level 4)))
+
+ +

(The ‘GNU’ styles for Emacs’ C and R modes use a basic indentation of 2, +which has been determined not to display the structure clearly enough +when using narrow fonts.) +

+
+ +
+


+
+ +

10 Testing R code

+ +

When you (as R developer) add new functions to the R base (all the
packages distributed with R), be careful to check whether make
test-Specific or, in particular, cd tests; make no-segfault.Rout
still works (without interactive user intervention, and on a standalone
computer). If the new function, for example, accesses the Internet, or
requires GUI interaction, please add its name to the "stop
list" in tests/no-segfault.Rin.

+

[To be revised: use make check-devel, check the write barrier +if you change internal structures.] +

+
+ + + +

11 Use of TeX dialects

+ +

Various dialects of TeX are used for different purposes in R. The
policy is that manuals be written in ‘texinfo’, and for convenience
the main and Windows FAQs are also. This has the advantage that it is
easy to produce HTML and plain text versions as well as typeset manuals.

+

LaTeX is not used directly, but rather as an intermediate format for +typeset help documents and for vignettes. +

+

Care needs to be taken about the assumptions made about the R user’s +system: it may not have either ‘texinfo’ or a TeX system +installed. We have attempted to abstract out the cross-platform +differences, and almost all the setting of typeset documents is done by +tools::texi2dvi. This is used for offline printing of help +documents, preparing vignettes and for package manuals via R +CMD Rd2pdf. It is not currently used for the R manuals created in +directory doc/manual. +

+

tools::texi2dvi makes use of a system command texi2dvi +where available. On a Unix-alike this is usually part of +‘texinfo’, whereas on Windows if it exists at all it would be an +executable, part of MiKTeX. If none is available, the R code runs +a sequence of (pdf)latex, bibtex and +makeindex commands. +

+

This process has been rather vulnerable to the versions of the external +software used: particular issues have been texi2dvi and +texinfo.tex updates, mismatches between the two24, +versions of the LaTeX package ‘hyperref’ and quirks in index +production. The licenses used for LaTeX and latterly ‘texinfo’ +prohibit us from including ‘known good’ versions in the R +distribution. +

+

On a Unix-alike configure looks for the executables for TeX and +friends and if found records the absolute paths in the system +Renviron file. This used to record ‘false’ if no command +was found, but it nowadays records the name for looking up on the path +at run time. The latter can be important for binary distributions: one +does not want to be tied to, for example, TeX Live 2007. +

+ +
+ + + +

12 Current and future directions

+ +

This chapter is for notes about possible in-progress and future changes +to R: there is no commitment to release such changes, let alone to a +timescale. +

+ + + + + + +
+ + + +

12.1 Long vectors

+ +

Vectors in R 2.x.y were limited to a length of 2^31 - 1 elements +(about 2 billion), as the length is stored in the SEXPREC as a C +int, and that type is used extensively to record lengths and +element numbers, including in packages. +

+

Note that longer vectors are effectively impossible under 32-bit +platforms because of their address limit, so this section applies only +on 64-bit platforms. The internals are unchanged on a 32-bit build of +R. +

+

A single object with 2^31 or more elements will take up at least 8GB of +memory if integer or logical and 16GB if numeric or character, so +routine use of such objects is still some way off. +

+
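A quick back-of-the-envelope check of those figures, assuming 4 bytes per integer or logical element and 8 bytes per double element:

2^31 * 4 / 2^30    # 8 gigabytes for an integer or logical vector of 2^31 elements
2^31 * 8 / 2^30    # 16 gigabytes for a double vector of 2^31 elements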

There is now some support for long vectors. This applies to raw, +logical, integer, numeric and character vectors, and lists and +expression vectors. (Elements of character vectors (CHARSXPs) +remain limited to 2^31 - 1 bytes.) Some considerations: +

+ +
    +
  • This has been implemented by recording the length (and true length) as +-1 and recording the actual length as a 64-bit field at the +beginning of the header. Because a fair amount of code in R uses a +signed type for the length, the ‘long length’ is recorded using the +signed C99 type ptrdiff_t, which is typedef-ed to +R_xlen_t. + +
  • These can in theory have 63-bit lengths, but note that current 64-bit +OSes do not even theoretically offer 64-bit address spaces and there is +currently a 52-bit limit (which exceeds the theoretical limit of current +OSes and ensures that such lengths can be stored exactly in doubles). + +
  • The serialization format has been changed to accommodate longer lengths, +but vectors of lengths up to 2^31-1 are stored in the same way as +before. Longer vectors have their length field set to -1 and +followed by two 32-bit fields giving the upper and lower 32-bits of the +actual length. There is currently a sanity check which limits lengths +to 2^48 on unserialization. + +
  • The type R_xlen_t is made available to packages in C header +Rinternals.h: this should be fine in C code since C99 is +required. People do try to use R internals in C++, but C++98 +compilers are not required to support these types. + +
  • Indexing can be done via the use of doubles. The internal indexing code +used to work with positive integer indices (and negative, logical and +matrix indices were all converted to positive integers): it now works +with either INTSXP or REALSXP indices. + +
  • R function length was documented to currently return an +integer, possibly NA. A lot of code has been written that +assumes that, and even code which calls as.integer(length(x)) +before passing to .C/.Fortran rarely checks for an +NA result. + +

    There is a new function xlength which works for long vectors and +returns a double value if the length exceeds 2^31-1. At present +length returns NA for long vectors, but it may be safer to +make that an error. +

    +
+ +
+ + + +

12.2 64-bit types

+ +

There is also some desire to be able to store larger integers in R, +although the possibility of storing these as double is often +overlooked (and e.g. file pointers as returned by seek are +already stored as double). +

+
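For example, a double holds integer values exactly up to 2^53, well beyond the 32-bit integer limit, which is why storing such counts as doubles is often sufficient:

.Machine$integer.max     # 2147483647, the limit of R's integer type
x <- 2^40                # well beyond integer.max, stored exactly as a double
x + 1 - x                # 1: no precision is lost at this magnitude
(2^53 + 1) == 2^53       # TRUE: exact integer representation ends at 2^53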

Different routes have been proposed: +

+
    +
  • Add a new type to R and use that for lengths and indices—most likely +this would be a 64-bit signed type, say longint. R’s usual +implicit coercion rules would ensure that supplying an integer +vector for indexing or length<- would work. + +
  • A more radical alternative is to change the existing integer type +to be 64-bit on 64-bit platforms (which was the approach taken by S-PLUS +for DEC/Compaq Alpha systems). Or even on all platforms. + +
  • Allow either integer or double values for lengths and +indices, and return double only when necessary. + +
+ +

The third has the advantages of minimal disruption to existing code and +not increasing memory requirements. In the first and third scenarios +both R’s own code and user code would have to be adapted for lengths +that were not of type integer, and in the third code branches for +long vectors would be tested rarely. +

+

Most users of the .C and .Fortran interfaces use +as.integer for lengths and element numbers, but a few omit these +in the knowledge that these were of type integer. It may be +reasonable to assume that these are never intended to be used with long +vectors. +

+

The remaining interfaces will need to cope with the changed
VECTOR_SEXPREC types. It seems likely that in most cases lengths
are accessed by the length and LENGTH
functions25. The current approach is to keep these returning 32-bit lengths and
introduce ‘long’ versions xlength and XLENGTH which return
R_xlen_t values.

+ +

See also http://homepage.cs.uiowa.edu/~luke/talks/useR10.pdf. +

+
+ + + +

12.3 Large matrices

+ +

Matrices are stored as vectors and so were also limited to 2^31-1
elements. Now that longer vectors are allowed on 64-bit platforms, matrices
with more elements are supported provided that each of the dimensions is
no more than 2^31-1. However, not all applications can be supported.

+

The main problem is linear algebra done by FORTRAN code compiled +with 32-bit INTEGER. Although not guaranteed, it seems that all +the compilers currently used with R on a 64-bit platform allow +matrices each of whose dimensions is less than 2^31 but with more than +2^31 elements, and index them correctly, and a substantial part of the +support software (such as BLAS and LAPACK) also +work. +

+

There are exceptions: for example some complex LAPACK +auxiliary routines do use a single INTEGER index and hence +overflow silently and segfault or give incorrect results. One example +is svd() on a complex matrix. +

+

Since this is implementation-dependent, it is possible that optimized +BLAS and LAPACK may have further restrictions, +although none have yet been encountered. For matrix algebra on large +matrices one almost certainly wants a machine with a lot of RAM (100s of +gigabytes), many cores and a multi-threaded BLAS. +

+ + +
+ + + +

Function and variable index

+ +
Index Entry  Section

.
.Device: Base environment
.Devices: Base environment
.Internal: .Internal vs .Primitive
.Last.value: Base environment
.Options: Base environment
.Primitive: .Internal vs .Primitive
.Random.seed: Global environment
.SavedPlots: Global environment
.Traceback: Base environment

_
_R_CHECK_ALL_NON_ISO_C_: Tools
_R_CHECK_ALWAYS_LOG_VIGNETTE_OUTPUT_: Tools
_R_CHECK_ASCII_CODE_: Tools
_R_CHECK_ASCII_DATA_: Tools
_R_CHECK_CLEAN_VIGN_TEST_: Tools
_R_CHECK_CODETOOLS_PROFILE_: Tools
_R_CHECK_CODE_ASSIGN_TO_GLOBALENV_: Tools
_R_CHECK_CODE_ATTACH_: Tools
_R_CHECK_CODE_DATA_INTO_GLOBALENV_: Tools
_R_CHECK_CODE_USAGE_VIA_NAMESPACES_: Tools
_R_CHECK_CODOC_S4_METHODS_: Tools
_R_CHECK_COMPACT_DATA_: Tools
_R_CHECK_CRAN_INCOMING_: Tools
_R_CHECK_DEPENDS_ONLY_: Tools
_R_CHECK_DEPRECATED_DEFUNCT_: Tools
_R_CHECK_DOC_SIZES2_: Tools
_R_CHECK_DOC_SIZES_: Tools
_R_CHECK_DOT_FIRSTLIB_: Tools
_R_CHECK_DOT_INTERNAL_: Tools
_R_CHECK_EXECUTABLES_: Tools
_R_CHECK_EXECUTABLES_EXCLUSIONS_: Tools
_R_CHECK_EXIT_ON_FIRST_ERROR_: Tools
_R_CHECK_FF_CALLS_: Tools
_R_CHECK_FF_DUP_: Tools
_R_CHECK_FORCE_SUGGESTS_: Tools
_R_CHECK_GCT_N_: Tools
_R_CHECK_INSTALL_DEPENDS_: Tools
_R_CHECK_LICENSE_: Tools
_R_CHECK_LIMIT_CORES_: Tools
_R_CHECK_NO_RECOMMENDED_: Tools
_R_CHECK_OVERWRITE_REGISTERED_S3_METHODS_: Tools
_R_CHECK_PERMISSIONS_: Tools
_R_CHECK_PKG_SIZES_: Tools
_R_CHECK_RD_CHECKRD_MINLEVEL_: Tools
_R_CHECK_RD_CONTENTS_: Tools
_R_CHECK_RD_EXAMPLES_T_AND_F_: Tools
_R_CHECK_RD_LINE_WIDTHS_: Tools
_R_CHECK_RD_STYLE_: Tools
_R_CHECK_RD_XREFS_: Tools
_R_CHECK_REPLACING_IMPORTS_: Tools
_R_CHECK_S3_METHODS_NOT_REGISTERED_: Tools
_R_CHECK_SCREEN_DEVICE_: Tools
_R_CHECK_SKIP_ARCH_: Tools
_R_CHECK_SKIP_EXAMPLES_ARCH_: Tools
_R_CHECK_SKIP_TESTS_ARCH_: Tools
_R_CHECK_SRC_MINUS_W_IMPLICIT_: Tools
_R_CHECK_SRC_MINUS_W_UNUSED_: Tools
_R_CHECK_SUBDIRS_NOCASE_: Tools
_R_CHECK_SUBDIRS_STRICT_: Tools
_R_CHECK_SUGGESTS_ONLY_: Tools
_R_CHECK_TIMINGS_: Tools
_R_CHECK_TOPLEVEL_FILES_: Tools
_R_CHECK_UNSAFE_CALLS_: Tools
_R_CHECK_USE_CODETOOLS_: Tools
_R_CHECK_USE_INSTALL_LOG_: Tools
_R_CHECK_VC_DIRS_: Tools
_R_CHECK_VIGNETTES_NLINES_: Tools
_R_CHECK_WALL_FORTRAN_: Tools
_R_CHECK_WINDOWS_DEVICE_: Tools
_R_CHECK_XREFS_REPOSITORIES_: Tools
_R_CHECK_XREFS_USE_ALIASES_FROM_CRAN_: Tools
_R_SHLIB_BUILD_OBJECTS_SYMBOL_TABLES_: Tools

A
alloca: Memory allocators
ARGSUSED: Rest of header
ATTRIB: Attributes
attribute_hidden: Hiding C entry points

C
Calloc: Memory allocators
copyMostAttributes: Attributes

D
DDVAL: Rest of header
debug bit: Rest of header
DispatchGeneric: Argument evaluation
DispatchOrEval: Argument evaluation
dump.frames: Global environment
DUPLICATE_ATTRIB: Attributes

E
emacs: R coding standards
error: Warnings and errors
errorcall: Warnings and errors

F
Free: Memory allocators

G
gp bits: Rest of header

I
invisible: Autoprinting

L
last.warning: Base environment
LEVELS: Rest of header

M
make: R coding standards
makeinfo: R coding standards
MISSING: Rest of header
MISSING: Missingness
mkChar: The CHARSXP cache
mkCharLenCE: The CHARSXP cache

N
NAMED: Rest of header
NAMED: Argument evaluation
NAMED: .Internal vs .Primitive
named bit: Rest of header

P
Perl: R coding standards
PRIMPRINT: Autoprinting
PRSEEN: Rest of header

R
Rdll.hide: Hiding C entry points
Realloc: Memory allocators
R_alloc: Memory allocators
R_AllocStringBuffer: Memory allocators
R_BaseNamespace: Namespaces
R_CheckStack: Memory allocators
R_CheckStack2: Memory allocators
R_FreeStringBuffer: Memory allocators
R_FreeStringBufferL: Memory allocators
R_MissingArg: Missingness
R_Visible: Autoprinting

S
SETLEVELS: Rest of header
SET_ARGUSED: Rest of header
SET_ATTRIB: Attributes
SET_DDVAL: Rest of header
SET_MISSING: Rest of header
SET_NAMED: Rest of header
spare bit: Rest of header

T
trace bit: Rest of header

U
UseMethod: Contexts

V
vmaxget: Memory allocators
vmaxset: Memory allocators

W
warning: Warnings and errors
warningcall: Warnings and errors

+
+ +
+ +
+


+
+ +

Concept index

+ +
Index Entry  Section

.
... argument: Rest of header
... argument: Dot-dot-dot arguments
.Internal function: Argument evaluation

A
allocation classes: Allocation classes
argument evaluation: Argument evaluation
argument list: SEXPTYPEs
atomic vector type: SEXPTYPEs
attributes: Attributes
attributes, preserving: Attributes
autoprinting: Autoprinting

B
base environment: Environments and variable lookup
base environment: Base environment
base namespace: Namespaces
builtin function: Argument evaluation

C
coding standards: R coding standards
context: Contexts
copying semantics: Rest of header
copying semantics: Attributes

E
environment: Environments and variable lookup
environment, base: Environments and variable lookup
environment, base: Base environment
environment, global: Global environment
expression: SEXPTYPEs

F
function: SEXPTYPEs

G
garbage collector: The write barrier
generic, generic: Argument evaluation
generic, internal: Argument evaluation
global environment: Global environment

L
language object: SEXPTYPEs

M
method dispatch: Contexts
missingness: Missingness
modules: Modules

N
namespace: Namespaces
namespace, base: Namespaces
node: SEXPs

P
preserving attributes: Attributes
primitive function: Argument evaluation
promise: Rest of header

S
S4 type: SEXPTYPEs
search path: Search paths
serialization: Serialization Formats
SEXP: SEXPs
SEXPRREC: SEXPs
SEXPTYPE: SEXPTYPEs
SEXPTYPE table: SEXPTYPEs
special function: Argument evaluation

U
user databases: Environments and variable lookup

V
variable lookup: Environments and variable lookup
vector type: The 'data'
visibility: Visibility

W
write barrier: The write barrier

+
+ +
+
+

Footnotes

+ +

(1)

+

strictly, a SEXPREC +node; VECTOR_SEXPREC nodes are slightly smaller but followed by +data in the node.

+

(2)

+

a pointer to a function or a symbol to look up the +function by name, or a language object to be evaluated to give a +function.

+

(3)

+

This is almost unused. The only +current use is for hash tables of environments (VECSXPs), where +length is the size of the table and truelength is the +number of primary slots in use, and for the reference hash tables in +serialization (VECSXPs), where truelength is the number of +slots in use.

+

(4)

+

Remember that attaching a list or +a saved image actually creates and populates an environment and attaches +that.

+

(5)

+

There is currently one other +difference: when profiling builtin functions are counted as function +calls but specials are not.

+

(6)

+

the other current example +is left brace, which is implemented as a primitive.

+

(7)

+

only bits 0:4 are currently used +for SEXPTYPEs but values 241:255 are used for +pseudo-SEXPTYPEs.

+

(8)

+

Currently the only relevant bits are 0:1, 4, 14:15.

+

(9)

+

See define +USE_UTF8_IF_POSSIBLE in file src/main/gram.c.

+

(10)

+

or UTF-16 if support for surrogates is enabled in the OS, +which it is not normally so at least for Western versions of Windows, +despite some claims to the contrary on the Microsoft website.

+

(11)

+

but not the +GraphApp toolkit.

+

(12)

+

This can also create +non-S4 objects, as in new("integer").

+

(13)

+

although this is +not recommended as it is less future-proof.

+

(14)

+

but apparently not on Windows.

+

(15)

+

The C code is in files +base.c, graphics.c, par.c, plot.c and +plot3d.c in directory src/main.

+

(16)

+

although that needs to be +handled carefully, as for example the circle callback is given a +radius (and that should be interpreted as in the x units).

+

(17)

+

It is +possible for the device to find the GEDevDesc which points to its +DevDesc, and this is done often enough that there is a +convenience function desc2GEDesc to do so.

+

(18)

+

Calling +R_CheckDeviceAvailable() ensures there is a free slot or throws +an error.

+

(19)

+

in device coordinates

+

(20)

+

It is technically possible to use alpha-blending on +metafile devices such as printers, but it seems few drivers have support +for this.

+

(21)

+

an Xcode project, in SVN at +https://svn.r-project.org/R-packages/trunk/Mac-GUI.

+

(22)

+

under Windows, junction points, or copies if +environment variable R_WIN_NO_JUNCTIONS has a non-empty value.

+

(23)

+

see the previous footnote.

+

(24)

+

Linux +distributions tend to unbundle texinfo.tex from ‘texinfo’.

+

(25)

+

but LENGTH is a macro under some internal +uses.

+
+
diff --git a/R-lang.html b/R-lang.html
new file mode 100644
index 0000000..f72546c
--- /dev/null
+++ b/R-lang.html
@@ -0,0 +1,6228 @@
+R Language Definition

R Language Definition

+ + + + + + + + + + + + + + + + + + + + + + + +

Table of Contents

+ +
+ + +
+ + + +
+


+
+ +

R Language Definition

+ +

This is an introduction to the R language, explaining evaluation, +parsing, object oriented programming, computing on the language, and so +forth. +

+

This manual is for R, version 3.2.3 (2015-12-10). +

+

Copyright © 2000–2015 R Core Team +

+
+

Permission is granted to make and distribute verbatim copies of this +manual provided the copyright notice and this permission notice are +preserved on all copies. +

+

Permission is granted to copy and distribute modified versions of this +manual under the conditions for verbatim copying, provided that the +entire resulting derived work is distributed under the terms of a +permission notice identical to this one. +

+

Permission is granted to copy and distribute translations of this manual +into another language, under the above conditions for modified versions, +except that this permission notice may be stated in a translation +approved by the R Core Team. +

+ + + + + + + + + + + + + + + + + + + +
+ +
+


+
+ +

1 Introduction

+ +

R is a system for statistical computation and graphics. It +provides, among other things, a programming language, high level +graphics, interfaces to other languages and debugging facilities. This +manual details and defines the R language. +

+

The R language is a dialect of S which was designed in the 1980s +and has been in widespread use in the statistical community since. +Its principal designer, John M. Chambers, was awarded the 1998 ACM +Software Systems Award for S. +

+

The language syntax has a superficial similarity with C, but the +semantics are of the FPL (functional programming language) variety with +stronger affinities with Lisp and APL. In particular, it +allows “computing on the language”, which in turn makes it possible to +write functions that take expressions as input, something that is often +useful for statistical modeling and graphics. +

+

It is possible to get quite far using R interactively, executing + +simple expressions from the command line. Some users may never need to +go beyond that level, others will want to write their own functions +either in an ad hoc fashion to systematize repetitive work or with the +perspective of writing add-on packages for new functionality. +

+

The purpose of this manual is to document the language per se. +That is, the objects that it works on, and the details of the expression +evaluation process, which are useful to know when programming R +functions. Major subsystems for specific tasks, such as graphics, are +only briefly described in this manual and will be documented separately. +

+

Although much of the text will equally apply to S, there are also +some substantial differences, and in order not to confuse the issue we +shall concentrate on describing R. +

+

The design of the language contains a number of fine points and +common pitfalls which may surprise the user. Most of these are due to +consistency considerations at a deeper level, as we shall explain. +There are also a number of useful shortcuts and idioms, which allow the +user to express quite complicated operations succinctly. Many of these +become natural once one is familiar with the underlying concepts. In +some cases, there are multiple ways of performing a task, but some of +the techniques will rely on the language implementation, and others work +at a higher level of abstraction. In such cases we shall indicate the +preferred usage. +

+

Some familiarity with R is assumed. This is not an introduction to +R but rather a programmers’ reference manual. Other manuals provide +complementary information: in particular Preface in An +Introduction to R provides an introduction to R and System and +foreign language interfaces in Writing R Extensions details +how to extend R using compiled code. +

+ + +
+ +
+


+
+ +

2 Objects

+ +

In every computer language + +variables provide a means of accessing the data stored in memory. R +does not provide direct access to the computer’s memory but rather +provides a number of specialized data structures we will refer to as + +objects. These objects +are referred to through symbols or variables. In R, however, the +symbols are themselves objects and can be manipulated in the same way as +any other object. This is different from many other languages and has +wide ranging effects. +

+

In this chapter we provide preliminary descriptions of the various data +structures provided in R. More detailed discussions of many of them +will be found in the subsequent chapters. The R specific function +typeof + + +returns the type of an R object. Note that in the C code +underlying R, all objects are pointers to a structure with typedef +SEXPREC; the different R data types are represented in C by +SEXPTYPE, which determines how the information in the various +parts of the structure is used. +

+

The following table describes the possible values returned by +typeof and what they are. +

+
"NULL"          NULL
"symbol"        a variable name
"pairlist"      a pairlist object (mainly internal)
"closure"       a function
"environment"   an environment
"promise"       an object used to implement lazy evaluation
"language"      an R language construct
"special"       an internal function that does not evaluate its arguments
"builtin"       an internal function that evaluates its arguments
"char"          a ‘scalar’ string object (internal only) ***
"logical"       a vector containing logical values
"integer"       a vector containing integer values
"double"        a vector containing real values
"complex"       a vector containing complex values
"character"     a vector containing character values
"..."           the special variable length argument ***
"any"           a special type that matches all types: there are no objects of this type
"expression"    an expression object
"list"          a list
"bytecode"      byte code (internal only) ***
"externalptr"   an external pointer object
"weakref"       a weak reference object
"raw"           a vector containing bytes
"S4"            an S4 object which is not a simple object
+
+ +

Users cannot easily get hold of objects of types marked with a ‘***’. +

+ + + +

Function mode gives information about the mode of an object +in the sense of Becker, Chambers & Wilks (1988), and is more compatible +with other implementations of the S language. + +Finally, the function storage.mode returns the storage mode +of its argument in the sense of Becker et al. (1988). It is generally +used when calling functions written in another language, such as C or +FORTRAN, to ensure that R objects have the data type expected by the +routine being called. (In the S language, vectors with integer or +real values are both of mode "numeric", so their storage modes +need to be distinguished.) +

+
+
> x <- 1:3
+> typeof(x)
+[1] "integer"
+> mode(x)
+[1] "numeric"
+> storage.mode(x)
+[1] "integer"
+
+ +

R + +objects are often coerced to different + +types during computations. +There are also many functions available to perform explicit + +coercion. +When programming in the R language the type of an object generally +doesn’t affect the computations, however, when dealing with foreign +languages or the operating system it is often necessary to ensure that +an object is of the correct type. +

+ + + + + + +
+ +
+


+
+ + +

2.1 Basic types

+ + + + + + + + + + + + + + + + + +
+ +
+


+
+ +

2.1.1 Vectors

+ + +

Vectors can be thought of as contiguous cells containing data. Cells +are accessed through + +indexing operations such as +x[5]. More details are given in Indexing. +

+ + + +

R has six basic (‘atomic’) vector types: logical, integer, real, +complex, string (or character) and raw. The modes and storage modes for +the different vector types are listed in the following table. +

+
typeof      mode        storage.mode
logical     logical     logical
integer     numeric     integer
double      numeric     double
complex     complex     complex
character   character   character
raw         raw         raw
+
+ +

Single numbers, such as 4.2, and strings, such as "four +point two" are still vectors, of length 1; there are no more basic +types. Vectors with length zero are possible (and useful). +

+
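For example, a single number and a zero-length vector:

length(4.2)          # 1: a single number is a vector of length one
x <- character(0)    # a zero-length character vector
length(x)            # 0
typeof(x)            # "character"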

String vectors have mode and storage mode "character". A single +element of a character vector is often referred to as a character +string. +

+ +
+ +
+


+
+ +

2.1.2 Lists

+ +

Lists (“generic vectors”) are another kind of data storage. Lists +have elements, each of which can contain any type of R object, i.e. +the elements of a list do not have to be of the same type. List +elements are accessed through three different + +indexing operations. +These are explained in detail in Indexing. +

+

Lists are vectors, and the basic vector types are referred to as +atomic vectors where it is necessary to exclude lists. +

+
+ +
+


+
+ +

2.1.3 Language objects

+ +

There are three types of objects that constitute the R language. +They are calls, expressions, and names. + + + +Since R has objects of type "expression" we will try to avoid +the use of the word expression in other contexts. In particular +syntactically correct expressions will be referred to as +statements. + +

+

These objects have modes "call", "expression", and +"name", respectively. +

+

They can be created directly from expressions using the quote +mechanism and converted to and from lists by the as.list and +as.call functions. + + + +Components of the + +parse tree can be extracted using the standard +indexing operations. +

+ + + + +
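A small sketch of creating a call object with quote and inspecting its parse tree:

cl <- quote(sum(x, 1))   # a call object, not evaluated
typeof(cl)               # "language"
mode(cl)                 # "call"
as.list(cl)              # the components of the parse tree: sum, x and 1
cl[[1]]                  # indexing extracts the function name (a symbol)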
+ +
+


+
+ +

2.1.3.1 Symbol objects

+ + + +

Symbols refer to R + +objects. The + +name of any R object is usually a +symbol. Symbols can be created through the functions as.name and +quote. +

+ + +

Symbols have mode "name", storage mode "symbol", and type +"symbol". They can be + +coerced to and from character strings +using as.character and as.name. + + + +They naturally appear as atoms of parsed expressions, try e.g. +as.list(quote(x + y)). +

+
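For example, creating a symbol and coercing it back to a character string:

s <- as.name("mean")
typeof(s)                # "symbol"
mode(s)                  # "name"
as.character(s)          # "mean"
as.list(quote(x + y))    # symbols appear as the atoms of a parsed expression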
+ +
+


+
+ +

2.1.4 Expression objects

+ +

In R one can have objects of type "expression". An +expression contains one or more statements. A statement is a +syntactically correct collection of + +tokens. + +Expression objects are special language objects which contain parsed but +unevaluated R statements. The main difference is that an expression +object can contain several such expressions. Another more subtle +difference is that objects of type "expression" are only + +evaluated when +explicitly passed to eval, whereas other language objects may get +evaluated in some unexpected cases. +

+

An + +expression object behaves much like a list and its components should +be accessed in the same way as the components of a list. +

+
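For example, an expression object holding two statements:

ex <- expression(1 + 1, x <- 2)   # parsed but unevaluated statements
typeof(ex)                        # "expression"
length(ex)                        # 2
eval(ex[[1]])                     # 2: evaluated only when passed to eval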
+ +
+


+
+ +

2.1.5 Function objects

+ + +

In R functions are objects and can be manipulated in much the same +way as any other object. Functions (or more precisely, function +closures) have three basic components: a formal argument list, a body +and an + +environment. The argument list is a comma-separated list of +arguments. An + +argument can be a symbol, or a ‘symbol = +default’ construct, or the special argument ‘...’. The +second form of argument is used to specify a default value for an +argument. This value will be used if the function is called without any +value specified for that argument. The ‘...’ argument is special +and can contain any number of arguments. It is generally used if the +number of arguments is unknown or in cases where the arguments will be +passed on to another function. +

+

The body is a parsed R statement. It is usually a collection of +statements in braces but it can be a single statement, a symbol or even +a constant. +

+

A function’s + + +environment is the environment that was active at the time +that the function was created. Any symbols bound in that environment +are captured and available to the function. This combination of +the code of the function and the bindings in its environment is called a +‘function closure’, a term from functional programming theory. In this +document we generally use the term ‘function’, but use ‘closure’ to +emphasize the importance of the attached environment. +

+

It is possible to extract and manipulate the three parts of a closure +object using formals, body, and environment +constructs (all three can also be used on the left hand side of + +assignments). + + + +The last of these can be used to remove unwanted environment capture. +

+
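For example, taking apart a small closure:

f <- function(x, y = 1) x + y
formals(f)        # the formal argument list, including the default for y
body(f)           # the parsed body: x + y
environment(f)    # the environment captured when f was created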

When a function is called, a new environment (called the +evaluation environment) is created, whose enclosure (see +Environment objects) is the environment from the function closure. +This new environment is initially populated with the unevaluated +arguments to the function; as evaluation proceeds, local variables are +created within it. +

+ +

There is also a facility for converting functions to and from list +structures using as.list and as.function. + +These have been included to provide compatibility with S and their +use is discouraged. +

+
+ + + +

2.1.6 NULL

+ +

There is a special object called NULL. It is used whenever there +is a need to indicate or specify that an object is absent. It should not be +confused with a vector or list of zero length. + +

+

The NULL object has no type and no modifiable properties. There +is only one NULL object in R, to which all instances refer. To +test for NULL use is.null. You cannot set attributes on +NULL. +

+ +
+ +
+


+
+ +

2.1.7 Builtin objects and special forms

+ +

These two kinds of object contain the builtin + + + +functions of R, i.e., those that are displayed as .Primitive +in code listings (as well as those accessed via the .Internal +function and hence not user-visible as objects). The difference between +the two lies in the argument handling. Builtin functions have all +their arguments evaluated and passed to the internal function, in +accordance with call-by-value, whereas special functions pass the +unevaluated arguments to the internal function. +

+

From the R language, these objects are just another kind of function. +The is.primitive function can distinguish them from interpreted + +functions. +

+
+ + + +

2.1.8 Promise objects

+ + +

Promise objects are part of R’s lazy evaluation mechanism. They +contain three slots: a value, an expression, and an + +environment. When a + + +function is called the arguments are matched and then each of the formal +arguments is bound to a promise. The expression that was given for that +formal argument and a pointer to the environment the function was called +from are stored in the promise. +

+

Until that argument is accessed there is no value associated with +the promise. When the argument is accessed, the stored expression is + +evaluated in the stored environment, and the result is returned. The +result is also saved by +the promise. The substitute function will extract the content +of the expression slot. This allows the programmer to +access either the value or the expression associated with the promise. +

+

Within the R language, promise objects are almost only seen +implicitly: actual function arguments are of this type. There is also a +delayedAssign function that will make a promise out of an +expression. There is generally no way in R code to check whether an +object is a promise or not, nor is there a way to use R code to +determine the environment of a promise. +

+
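A small sketch; the side effect in the delayed expression is only there to show when evaluation happens:

f <- function(x) substitute(x)    # extracts the expression slot of the promise
f(a + b)                          # the unevaluated expression a + b
delayedAssign("z", {cat("forced now\n"); 42})
z                                 # first access forces the promise: prints, then 42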
+ + + +

2.1.9 Dot-dot-dot

+ +

The ‘...’ object type is stored as a type of pairlist. The
components of ‘...’ can be accessed in the usual pairlist manner
from C code, but ‘...’ is not easily accessed as an object in interpreted
code. The object can be captured as a list, so for example in
table one sees

+
+
    args <- list(...)
+## ....
+    for (a in args) {
+## ....
+
+ + + +

If a function has ‘...’ as a formal argument then any actual +arguments that do not match a formal argument are matched with +‘...’. +

+
+ +
+


+
+ +

2.1.10 Environments

+ + +

Environments can be thought of as consisting of two things. A +frame, consisting of a set of symbol-value pairs, and an +enclosure, a pointer to an enclosing environment. When R +looks up the value for a symbol the frame is examined and if a +matching symbol is found its value will be returned. If not, the +enclosing environment is then accessed and the process repeated. +Environments form a tree structure in which the enclosures play the +role of parents. The tree of environments is rooted in an empty + +environment, available through emptyenv(), which has no parent. +It is the direct parent of the environment of the base package + +(available through the baseenv() function). Formerly +baseenv() had the special value NULL, but as from +version 2.4.0, the use of NULL as an environment is defunct. +

+

Environments are created implicitly by function calls, as described in +Function objects and Lexical environment. In this case the +environment contains the variables local to the function (including the +arguments), and its enclosure is the environment of the currently called +function. Environments may also be created directly by new.env. + +The frame content of an environment can be accessed and manipulated by +use of ls, get and assign as well as eval and +evalq. +

+

The parent.env function may be used to access the enclosure of +an environment. +

+

Unlike most other R objects, environments are not copied when passed +to functions or used in assignments. Thus, if you assign the same +environment to several symbols and change one, the others will change +too. In particular, assigning attributes to an environment can lead to +surprises. +

+
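For example, the sharing of environments can be observed directly:

e <- new.env()
assign("a", 1, envir = e)
ls(e)                      # "a"
get("a", envir = e)        # 1
f <- e                     # no copy is made: f and e are the same environment
assign("a", 2, envir = f)
get("a", envir = e)        # 2: the change is visible through both symbols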
+ +
+


+
+ +

2.1.11 Pairlist objects

+ +

Pairlist objects are similar to Lisp’s dotted-pair lists. They are used +extensively in the internals of R, but are rarely visible in +interpreted code, although they are returned by formals, and can +be created by (e.g.) the pairlist function. A zero-length +pairlist is NULL, as would be expected in Lisp but in contrast to +a zero-length list. + +Each such object has three slots, a CAR value, a CDR value and a TAG +value. The TAG value is a text string and CAR and CDR usually +represent, respectively, a list item (head) and the remainder (tail) of +the list with a NULL object as terminator (the CAR/CDR terminology is +traditional Lisp and originally referred to the address and decrement +registers on an early 60’s IBM computer). +

+

Pairlists are handled in the R language in exactly the same way as +generic vectors (“lists”). In particular, elements are accessed using +the same [[]] syntax. The use of pairlists is deprecated since +generic vectors are usually more efficient to use. When an internal +pairlist is accessed from R it is generally (including when +subsetted) converted to a generic vector. +

+

In a very few cases pairlists are user-visible: one is .Options. +

+
+ +
+


+
+ +

2.1.12 The “Any” type

+ +

It is not really possible for an object to be of “Any” type, but it is +nevertheless a valid type value. It gets used in certain (rather rare) +circumstances, e.g. as.vector(x, "any"), indicating that type + +coercion should not be done. +

+ +
+ +
+


+
+ +

2.2 Attributes

+ + + +

All objects except NULL can have one or more attributes attached +to them. Attributes are stored as a pairlist where all elements are +named, but should be thought of as a set of name=value pairs. A listing +of the attributes can be obtained using attributes and set by +attributes<-, + + +individual components are accessed using attr and attr<-. + + +

+

Some attributes have special accessor + +functions (e.g. levels<- +for factors) and these should be used when available. In addition to +hiding details of implementation they may perform additional operations. +R attempts to intercept calls to attr<- and to +attributes<- that involve the special attributes and enforces +the consistency checks. +

+

Matrices and arrays are simply vectors with the attribute dim and +optionally dimnames attached to the vector. +

+

Attributes are used to implement the class structure used in R. If an +object has a class attribute then that attribute will be examined +during + +evaluation. The class structure in R is described in detail +in Object-oriented programming. +

+ + + + + + + + + +
+ +
+


+
+ +

2.2.1 Names

+ +

A names attribute, when present, labels the individual elements of +a vector or list. When an object is printed the names attribute, +when present, is used to label the elements. The names attribute +can also be used for indexing purposes, for example, +quantile(x)["25%"]. +

+

One may get and set the names using names and names<- +constructions. + + + +The latter will perform the necessary consistency checks to ensure that +the names attribute has the proper type and length. +

+
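For example, getting, using and replacing names:

x <- c(a = 1, b = 2, c = 3)
names(x)             # "a" "b" "c"
x["b"]               # indexing by name
names(x)[3] <- "z"   # replacement form, with consistency checks applied
attr(x, "names")     # the same vector that names(x) returns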

Pairlists and one-dimensional arrays are treated specially. For pairlist +objects, a virtual names attribute is used; the names +attribute is really constructed from the tags of the list components. +For one-dimensional arrays the names attribute really accesses +dimnames[[1]]. +

+
+ +
+


+
+ +

2.2.2 Dimensions

+ +

The dim attribute is used to implement arrays. The content of +the array is stored in a vector in column-major order and the dim +attribute is a vector of integers specifying the respective extents of +the array. R ensures that the length of the vector is the product of +the lengths of the dimensions. The length of one or more dimensions may +be zero. +

+ +

A vector is not the same as a one-dimensional array since the latter has +a dim attribute of length one, whereas the former has no +dim attribute. +

+
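For example, turning a vector into a matrix and into a one-dimensional array:

x <- 1:6
dim(x) <- c(2, 3)    # x is now a 2 x 3 matrix, stored in column-major order
x[2, 3]              # 6
y <- 1:6
dim(y) <- 6          # a one-dimensional array, unlike a plain vector
is.array(y)          # TRUE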
+ +
+


+
+ +

2.2.3 Dimnames

+ +

Arrays may name each dimension separately using the dimnames +attribute which is a list of character vectors. The dimnames +list may itself have names which are then used for extent headings when +printing arrays. +

+
+ +
+


+
+ +

2.2.4 Classes

+ +

R has an elaborate class system1, principally controlled via +the class attribute. This attribute is a character vector +containing the list of classes that an object inherits from. This forms +the basis of the “generic methods” functionality in R. +

+

This attribute can be accessed and manipulated virtually without +restriction by users. There is no checking that an object actually +contains the components that class methods expect. Thus, altering the +class attribute should be done with caution, and when they are +available specific creation and + +coercion functions should be preferred. +

+
+ +
+

+Next: , Previous: , Up: Attributes   [Contents][Index]

+
+ +

2.2.5 Time series attributes

+ +

The tsp attribute is used to hold parameters of time series, +start, end, and frequency. This construction is mainly used to handle +series with periodic substructure such as monthly or quarterly data. +
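A small illustration using the ts constructor (from the stats package, which is attached by default) to create a monthly series:

> x <- ts(1:24, start = c(2000, 1), frequency = 12)
> class(x)
[1] "ts"
> frequency(x)
[1] 12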

+
+ +
+

+Previous: , Up: Attributes   [Contents][Index]

+
+ +

2.2.6 Copying of attributes

+ +

Whether attributes should be copied when an object is altered is a +complex area, but there are some general rules (Becker, Chambers & +Wilks, 1988, pp. 144–6). +

+

Scalar functions (those which operate element-by-element on a vector and +whose output is similar to the input) should preserve attributes (except +perhaps class). +

+

Binary operations normally copy most attributes from the longer argument +(and if they are of the same length from both, preferring the values on +the first). Here ‘most’ means all except the names, dim +and dimnames which are set appropriately by the code for the +operator. +

+

Subsetting (other than by an empty index) generally drops all attributes +except names, dim and dimnames which are reset as +appropriate. On the other hand, subassignment generally preserves +attributes even if the length is changed. Coercion drops all +attributes. +

+

The default method for sorting drops all attributes except names, which +are sorted along with the object. +

+ +
+ +
+

+Previous: , Up: Objects   [Contents][Index]

+
+ +

2.3 Special compound objects

+ + + + + + +
+ + + +

2.3.1 Factors

+ +

Factors are used to describe items that can have a finite number of values (gender, social class, etc.). A factor has a levels attribute and class "factor". Optionally, it may also contain a contrasts attribute which controls the parametrisation used when the factor is used in modeling functions.

+

A factor may be purely nominal or may have ordered categories. In the latter case, it should be defined as such and have a class vector c("ordered", "factor").

+

Factors are currently implemented using an integer array to specify the +actual levels and a second array of names that are mapped to the +integers. Rather unfortunately users often make use of the +implementation in order to make some calculations easier. This, +however, is an implementation issue and is not guaranteed to hold in all +implementations of R. +
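For example (the levels "low" and "high" are purely illustrative):

> f <- factor(c("low", "high", "low"), levels = c("low", "high"))
> levels(f)
[1] "low"  "high"
> class(f)
[1] "factor"
> as.integer(f)
[1] 1 2 1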

+
+ +
+

+Previous: , Up: Special compound objects   [Contents][Index]

+
+ +

2.3.2 Data frame objects

+ +

Data frames are the R structures which most closely mimic the SAS or +SPSS data set, i.e. a “cases by variables” matrix of data. +

+

A data frame is a list of vectors, factors, and/or matrices all having +the same length (number of rows in the case of matrices). In addition, +a data frame generally has a names attribute labeling the +variables and a row.names attribute for labeling the cases. +

+

A data frame can contain a list that is the same length as the other +components. The list can contain elements of differing lengths thereby +providing a data structure for ragged arrays. However, as of this +writing such arrays are not generally handled correctly. +
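A minimal example (the column names x and f are illustrative):

> d <- data.frame(x = 1:3, f = factor(c("a", "b", "a")))
> d
  x f
1 1 a
2 2 b
3 3 a
> names(d)
[1] "x" "f"
> row.names(d)
[1] "1" "2" "3"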

+ + + + + + + + + + + + + + + + + + + + + +
+ +
+

+Next: , Previous: , Up: Top   [Contents][Index]

+
+ +

3 Evaluation of expressions

+ +

When a user types a command at the prompt (or when an expression is read +from a file) the first thing that happens to it is that the command is +transformed by the + +parser into an internal representation. The +evaluator executes parsed R expressions and returns the value of the +expression. All expressions have a value. This is the core of the +language. +

+

This chapter describes the basic mechanisms of the evaluator, but avoids +discussion of specific functions or groups of functions which are +described in separate chapters later on or where the help pages should +be sufficient documentation. +

+

Users can construct expressions and invoke the evaluator on them. +

+ + + + + + + + +
+ + + +

3.1 Simple evaluation

+ + + + + + + + +
+ + + +

3.1.1 Constants

+ +

Any number typed directly at the prompt is a constant and is evaluated. +

+
+
> 1
+[1] 1
+
+ +

Perhaps unexpectedly, the number returned from the expression 1 +is a numeric. In most cases, the difference between an integer and a +numeric value will be unimportant as R will do the right thing when +using the numbers. There are, however, times when we would like to +explicitly create an integer value for a constant. We can do this by +calling the function as.integer or using various other +techniques. But perhaps the simplest approach is to qualify our +constant with the suffix character ‘L’. +For example, to create the integer value 1, we might use +

+
+
> 1L
+[1] 1
+
+ +

We can use the ‘L’ suffix to qualify any number with the intent of +making it an explicit integer. So ‘0x10L’ creates the integer value +16 from the hexadecimal representation. The constant 1e3L gives 1000 +as an integer rather than a numeric value and is equivalent to 1000L. +(Note that the ‘L’ is treated as qualifying the term 1e3 and not the +3.) If we qualify a value with ‘L’ that is not an integer value, +e.g. 1e-3L, we get a warning and the numeric value is created. +A warning is also created if there is an unnecessary decimal point +in the number, e.g. 1.L. +
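For example:

> 0x10L
[1] 16
> 1e3L
[1] 1000
> is.integer(1e3L)
[1] TRUE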

+

We get a syntax error when using ‘L’ with complex numbers, +e.g. 12iL gives an error. +

+

Constants are fairly boring and to do more we need symbols. +

+
+ +
+

+Next: , Previous: , Up: Simple evaluation   [Contents][Index]

+
+ +

3.1.2 Symbol lookup

+ +

When a new variable is created it must have a + +name so it can be referenced and it usually has a value. The name itself is a + +symbol. +When a symbol is + +evaluated its + +value is returned. Later we shall +explain in detail how to determine the value associated with a symbol. +

+

In this small example y is a symbol and its value is 4. A symbol +is an R object too, but one rarely needs to deal with symbols +directly, except when doing “programming on the language” +(Computing on the language). +

+
+
> y <- 4
+> y
+[1] 4
+
+ + + + + + +
+ +
+

+Next: , Previous: , Up: Simple evaluation   [Contents][Index]

+
+ +

3.1.3 Function calls

+ +

Most of the computations carried out in R involve the evaluation of +functions. We will also refer to this as + +function invocation. +Functions are invoked by name with a list of arguments separated by +commas. +

+
+
> mean(1:10)
+[1] 5.5
+
+ +

In this example the function mean was called with one argument, +the vector of integers from 1 to 10. +

+

R contains a huge number of functions with different purposes. Most +are used for producing a result which is an R object, but others are +used for their side effects, e.g., printing and plotting functions. +

+ + +

Function calls can have tagged (or named) arguments, as in +plot(x, y, pch = 3). Arguments without tags are known as +positional since the function must distinguish their meaning from +their sequential positions among the arguments of the call, e.g., that +x denotes the abscissa variable and y the ordinate. The +use of tags/names is an obvious convenience for functions with a large +number of optional arguments. +

+ +

A special type of function calls can appear on the left hand side of +the + +assignment operator as in +

+
+
> class(x) <- "foo"
+
+ +

What this construction really does is to call the function +class<- with the original object and the right hand side. This +function performs the modification of the object and returns the result +which is then stored back into the original variable. (At least +conceptually, this is what happens. Some additional effort is made to +avoid unnecessary data duplication.) +

+ +
+ +
+

+Previous: , Up: Simple evaluation   [Contents][Index]

+
+ +

3.1.4 Operators

+ +

R allows the use of arithmetic expressions using operators similar to +those of the C programming language, for instance +

+
+
> 1 + 2
+[1] 3
+
+ +

Expressions can be grouped using parentheses, mixed with function calls, +and assigned to variables in a straightforward manner +

+
+
> y <- 2 * (a + log(x))
+
+ +

R contains a number of operators. They are listed in the table +below. +

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
-Minus, can be unary or binary
+Plus, can be unary or binary
!Unary not
~Tilde, used for model formulae, can be either unary or binary
?Help
:Sequence, binary (in model formulae: interaction)
*Multiplication, binary
/Division, binary
^Exponentiation, binary
%x%Special binary operators, x can be replaced by any valid name
%%Modulus, binary
%/%Integer divide, binary
%*%Matrix product, binary
%o%Outer product, binary
%x%Kronecker product, binary
%in%Matching operator, binary (in model formulae: nesting)
<Less than, binary
>Greater than, binary
==Equal to, binary
>=Greater than or equal to, binary
<=Less than or equal to, binary
&And, binary, vectorized
&&And, binary, not vectorized
|Or, binary, vectorized
||Or, binary, not vectorized
<-Left assignment, binary
->Right assignment, binary
$List subset, binary
+
+ +

Except for the syntax, there is no difference between applying an +operator and calling a function. In fact, x + y can equivalently +be written `+`(x, y). Notice that since ‘+’ is a +non-standard function name, it needs to be quoted. +
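For example, the following two expressions are equivalent:

> x <- 2; y <- 3
> x + y
[1] 5
> `+`(x, y)
[1] 5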

+ +

R deals with entire vectors of data at a time, and most of the +elementary operators and basic mathematical functions like log +are vectorized (as indicated in the table above). This means that +e.g. adding two vectors of the same length will create a vector +containing the element-wise sums, implicitly looping over the vector +index. This applies also to other operators like -, *, +and / as well as to higher dimensional structures. Notice in +particular that multiplying two matrices does not produce the usual +matrix product (the %*% operator exists for that purpose). Some +finer points relating to vectorized operations will be discussed in +Elementary arithmetic operations. +

+

To access individual elements of an atomic vector, one generally uses +the x[i] construction. +

+
+
> x <- rnorm(5)
+> x
+[1] -0.12526937 -0.27961154 -1.03718717 -0.08156527  1.37167090
+> x[2]
+[1] -0.2796115
+
+ +

List components are more commonly accessed using x$a or +x[[i]]. +

+
+
> x <- options()
+> x$prompt
+[1] "> "
+
+ +

Indexing constructs can also appear on the right hand side of an + +assignment. +

+

Like the other operators, indexing is really done by functions, and one +could have used `[`(x, 2) instead of x[2]. +

+

R’s indexing operations contain many advanced features which are +further described in Indexing. +

+
+ + + +

3.2 Control structures

+ +

Computation in R consists of sequentially evaluating statements. Statements, such as x<-1:10 or mean(y), can be separated by either a semi-colon or a new line. Whenever the evaluator is presented with a syntactically complete statement that statement is evaluated and the value returned. The result of evaluating a statement can be referred to as the value of the statement. The value can always be assigned to a symbol.

+

Both semicolons and new lines can be used to separate statements. A +semicolon always indicates the end of a statement while a new line +may indicate the end of a statement. If the current statement is +not syntactically complete new lines are simply ignored by the +evaluator. If the session is interactive the prompt changes from +‘>’ to ‘+’. +

+
+
> x <- 0; x + 5
+[1] 5
+> y <- 1:10
+> 1; 2
+[1] 1
+[1] 2
+
+ +

Statements can be grouped together using braces ‘{’ and ‘}’. +A group of statements is sometimes called a block. Single +statements are evaluated when a new line is typed at the end of the +syntactically complete statement. Blocks are not evaluated until a new +line is entered after the closing brace. In the remainder of this +section, statement refers to either a single statement or a +block. +

+
+
> { x <- 0
++ x + 5
++ }
+[1] 5
+
+ + + + + + + + + + +
+ +
+

+Next: , Previous: , Up: Control structures   [Contents][Index]

+
+ +

3.2.1 if

+ +

The if/else statement conditionally evaluates two +statements. There is a condition which is evaluated and if the +value is TRUE then the first statement is evaluated; +otherwise the second statement will be evaluated. The +if/else statement returns, as its value, the value of the +statement that was selected. The formal syntax is +

+
+
if ( statement1 )
+    statement2
+else
+    statement3
+
+ +

First, statement1 is evaluated to yield value1. If +value1 is a logical vector with first element TRUE then +statement2 is evaluated. If the first element of value1 is +FALSE then statement3 is evaluated. If value1 is a +numeric vector then statement3 is evaluated when the first element +of value1 is zero and otherwise statement2 is evaluated. +Only the first element of value1 is used. All other elements are +ignored. If value1 has any type other than a logical or a numeric +vector an error is signalled. +

+

if/else statements can be used to avoid numeric problems +such as taking the logarithm of a negative number. Because +if/else statements are the same as other statements you +can assign the value of them. The two examples below are equivalent. +

+
+
> if( any(x <= 0) ) y <- log(1+x) else y <- log(x)
+> y <- if( any(x <= 0) ) log(1+x) else log(x)
+
+ +

The else clause is optional. The statement if(any(x <= 0)) +x <- x[x <= 0] is valid. When the if statement is not in a +block the else, if present, must appear on the same line as +the end of statement2. Otherwise the new line at the end of +statement2 completes the if and yields a syntactically +complete statement that is evaluated. A simple solution is to use a +compound statement wrapped in braces, putting the else on the +same line as the closing brace that marks the end of the statement. +

+

if/else statements can be nested. +

+
+
if ( statement1 ) {
+    statement2
+} else if ( statement3 ) {
+    statement4
+} else if ( statement5 ) {
+    statement6
+} else
+    statement8
+
+ +

One of the even numbered statements will be evaluated and the resulting +value returned. If the optional else clause is omitted and all +the odd numbered statements evaluate to FALSE no statement +will be evaluated and NULL is returned. +

+

The odd numbered statements are evaluated, in order, until one +evaluates to TRUE and then the associated even numbered +statement is evaluated. In this example, statement6 will +only be evaluated if statement1 is FALSE and +statement3 is FALSE and statement5 is TRUE. +There is no limit to the number of else if clauses that are +permitted. +

+
+ +
+

+Next: , Previous: , Up: Control structures   [Contents][Index]

+
+ +

3.2.2 Looping

+ +

R has three statements that provide explicit +looping.3 They are for, while and +repeat. The two built-in constructs, next and +break, provide additional control over the evaluation. Each of +the three statements returns the value of the last statement that was +evaluated. It is possible, although uncommon, to assign the result of +one of these statements to a symbol. R provides other functions for +implicit looping such as tapply, apply, and lapply. +In addition many operations, especially arithmetic ones, are vectorized +so you may not need to use a loop. +

+

There are two statements that can be used to explicitly control looping. +They are break and next. + + +The break statement causes an exit from the innermost loop that +is currently being executed. The next statement immediately +causes control to return to the start of the loop. The next iteration +of the loop (if there is one) is then executed. No statement below +next in the current loop is evaluated. +

+

The value returned by a loop statement is always NULL +and is returned invisibly. +

+
+ +
+

+Next: , Previous: , Up: Control structures   [Contents][Index]

+
+ +

3.2.3 repeat

+ + +

The repeat statement causes repeated evaluation of the body until +a break is specifically requested. This means that you need to be +careful when using repeat because of the danger of an infinite +loop. The syntax of the repeat loop is +

+
+
repeat statement
+
+ +

When using repeat, statement must be a block statement. +You need to both perform some computation and test whether or not to +break from the loop and usually this requires two statements. +
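For example (the variable i and the stopping condition are purely illustrative):

> i <- 1
> repeat {
+     i <- i * 2
+     if (i > 10) break
+ }
> i
[1] 16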

+
+ +
+

+Next: , Previous: , Up: Control structures   [Contents][Index]

+
+ +

3.2.4 while

+ + +

The while statement is very similar to the repeat +statement. The syntax of the while loop is +

+
+
while ( statement1 ) statement2
+
+ +

where statement1 is evaluated and if its value is TRUE then +statement2 is evaluated. This process continues until +statement1 evaluates to FALSE. +

+
+ +
+

+Next: , Previous: , Up: Control structures   [Contents][Index]

+
+ +

3.2.5 for

+ + +

The syntax of the for loop is +

+
+
for ( name in vector )
+   statement1
+
+ +

where vector can be either a vector or a list. For each element +in vector the variable name is set to the value of that +element and statement1 is evaluated. A side effect is that the +variable name still exists after the loop has concluded and it has +the value of the last element of vector that the loop was +evaluated for. +
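For example, note that the loop variable keeps the last value afterwards:

> for (name in c(2, 4, 6)) print(name + 1)
[1] 3
[1] 5
[1] 7
> name
[1] 6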

+
+ +
+

+Previous: , Up: Control structures   [Contents][Index]

+
+ +

3.2.6 switch

+ + +

Technically speaking, switch is just another function, but its +semantics are close to those of control structures of other programming +languages. +

+

The syntax is +

+
+
switch (statement, list)
+
+ +

where the elements of list may be named. First, statement +is evaluated and the result, value, obtained. If value is a +number between 1 and the length of list then the corresponding +element of list is evaluated and the result returned. If value +is too large or too small NULL is returned. +

+
+
> x <- 3
+> switch(x, 2+2, mean(1:10), rnorm(5))
+[1]  2.2903605  2.3271663 -0.7060073  1.3622045 -0.2892720
+> switch(2, 2+2, mean(1:10), rnorm(5))
+[1] 5.5
+> switch(6, 2+2, mean(1:10), rnorm(5))
+NULL
+
+ +

If value is a character vector then the element of ‘...’ with +a name that exactly matches value is evaluated. If there is no +match a single unnamed argument will be used as a default. If no +default is specified, NULL is returned. +

+
+
> y <- "fruit"
+> switch(y, fruit = "banana", vegetable = "broccoli", "Neither")
+[1] "banana"
+> y <- "meat"
+> switch(y, fruit = "banana", vegetable = "broccoli", "Neither")
+[1] "Neither"
+
+ +

A common use of switch is to branch according to the character +value of one of the arguments to a function. +

+
+
> centre <- function(x, type) {
++ switch(type,
++        mean = mean(x),
++        median = median(x),
++        trimmed = mean(x, trim = .1))
++ }
+> x <- rcauchy(10)
+> centre(x, "mean")
+[1] 0.8760325
+> centre(x, "median")
+[1] 0.5360891
+> centre(x, "trimmed")
+[1] 0.6086504
+
+ +

switch returns either the value of the statement that was +evaluated or NULL if no statement was evaluated. +

+

To choose from a list of alternatives that already exists switch +may not be the best way to select one for evaluation. It is often +better to use eval and the subset operator, [[, directly +via eval(x[[condition]]). +

+
+ + + +

3.3 Elementary arithmetic operations

+ + + + + + + + +

In this section, we discuss the finer points of the rules that apply to +basic operation like addition or multiplication of two vectors or +matrices. +

+
+ + + +

3.3.1 Recycling rules

+

If one tries to add two structures with a different number of elements, +then the shortest is recycled to length of longest. That is, if for +instance you add c(1, 2, 3) to a six-element vector then you will +really add c(1, 2, 3, 1, 2, 3). If the length of the longer +vector is not a multiple of the shorter one, a warning is given. +
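For instance, adding a three-element vector to a six-element vector recycles the shorter one:

> c(1, 2, 3) + c(10, 20, 30, 40, 50, 60)
[1] 11 22 33 41 52 63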

+

As from R 1.4.0, any arithmetic operation involving a zero-length +vector has a zero-length result. +

+
+ + + +

3.3.2 Propagation of names

+ +

When vectors are combined in elementwise operations, the names of the result are taken from the first operand that has a names attribute; if recycling takes place, the recycled (shorter) operand does not contribute its names.

+ +
+ + + +

3.3.3 Dimensional attributes

+ +

When two matrices (or arrays) are combined, their dimensions must match exactly. When a vector is combined with a matrix, the vector is first recycled and the resulting length is then checked against the dimensions of the matrix; an error is signalled if they do not fit.

+
+ + + +

3.3.4 NA handling

+ +

Missing values in the statistical sense, that is, variables whose value +is not known, have the value NA. This should not be confused with +the missing property for a function argument that has not been +supplied (see Arguments). + + + +

+ +

As the elements of an atomic vector must be of the same type there are +multiple types of NA values. There is one case where this is +particularly important to the user. The default type of NA is +logical, unless coerced to some other type, so the appearance of +a missing value may trigger logical rather than numeric indexing (see +Indexing for details). +

+

Numeric and logical calculations with NA generally return +NA. In cases where the result of the operation would be the same +for all possible values the NA could take, the operation may +return this value. In particular, ‘FALSE & NA’ is FALSE, +‘TRUE | NA’ is TRUE. NA is not equal to any other +value or to itself; testing for NA is done using is.na. + +However, an NA value will match another NA value in +match. +
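A few illustrative computations:

> NA + 1
[1] NA
> NA == NA
[1] NA
> FALSE & NA
[1] FALSE
> TRUE | NA
[1] TRUE
> is.na(NA)
[1] TRUE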

+

Numeric calculations whose result is undefined, such as ‘0/0’, +produce the value NaN. This exists only in the double +type and for real or imaginary components of the complex type. The +function is.nan is provided to check specifically for + +NaN, is.na also returns TRUE for NaN. + +Coercing NaN to logical or integer type gives an NA of the +appropriate type, but coercion to character gives the string +"NaN". NaN values are incomparable so tests of equality +or collation involving NaN will result in NA. They are +regarded as matching any NaN value (and no other value, not even +NA) by match. +

+

The NA of character type is as from R 1.5.0 distinct from the +string "NA". Programmers who need to specify an explicit string +NA should use ‘as.character(NA)’ rather than "NA", or +set elements to NA using is.na<-. +

+

As from R 2.5.0 there are constants NA_integer_, +NA_real_, NA_complex_ and NA_character_ which will +generate (in the parser) an NA value of the appropriate type, +and will be used in deparsing when it is not otherwise possible to +identify the type of an NA (and the control options ask +for this to be done). +

+

There is no NA value for raw vectors. +

+ +
+ + + +

3.4 Indexing

+ +

R contains several constructs which allow access to individual +elements or subsets through indexing operations. In the case of the +basic vector types one can access the i-th element using x[i], +but there is also indexing of lists, matrices, and multi-dimensional +arrays. There are several forms of indexing in addition to indexing +with a single integer. Indexing can be used both to extract part of an +object and to replace parts of an object (or to add parts). +

+

R has three basic indexing operators, with syntax displayed by the +following examples +

+
+
x[i]
+x[i, j]
+x[[i]]
+x[[i, j]]
+x$a
+x$"a"
+
+ + + + + +

For vectors and matrices the [[ forms are rarely used, although +they have some slight semantic differences from the [ form (e.g. +it drops any names or dimnames attribute, and that partial +matching is used for character indices). When indexing +multi-dimensional structures with a single index, x[[i]] or +x[i] will return the ith sequential element of x. +

+

For lists, one generally uses [[ to select any single element, +whereas [ returns a list of the selected elements. +

+

The [[ form allows only a single element to be selected using +integer or character indices, whereas [ allows indexing by +vectors. Note though that for a list or other recursive object, the +index can be a vector and each element of the vector is applied in +turn to the list, the selected component, the selected component of +that component, and so on. The result is still a single element. +

+

The form using $ applies to recursive objects such as lists and +pairlists. It allows only a literal character string or a symbol as the +index. That is, the index is not computable: for cases where you need +to evaluate an expression to find the index, use x[[expr]]. When +$ is applied to a non-recursive object the result used to be +always NULL: as from R 2.6.0 this is an error. +

+ + + + + + + +
+ +
+

+Next: , Previous: , Up: Indexing   [Contents][Index]

+
+ +

3.4.1 Indexing by vectors

+ +

R allows some powerful constructions using vectors as indices. We +shall discuss indexing of simple vectors first. For simplicity, assume +that the expression is x[i]. Then the following possibilities +exist according to the type of i. +

+
    +
  • +Integer. All elements of i must have the same sign. If +they are positive, the elements of x with those index numbers are +selected. If i contains negative elements, all elements except +those indicated are selected. + +

    If i is positive and exceeds length(x) then the +corresponding selection is NA. Negative out of bounds values +for i are silently disregarded since R version 2.6.0, S compatibly, +as they mean to drop non-existing elements and that is an empty operation +(“no-op”). +

    +

    A special case is the zero index, which has null effects: x[0] is +an empty vector and otherwise including zeros among positive or negative +indices has the same effect as if they were omitted. +

    +
  • Other numeric. Non-integer values are converted to integer +(by truncation towards zero) before use. + +
  • Logical. The indexing i should generally have the same +length as x. If it is shorter, then its elements will be +recycled as discussed in Elementary arithmetic operations. If it +is longer, then x is conceptually extended with NAs. The +selected values of x are those for which i is TRUE. + +
  • +Character. The strings in i are matched against the +names attribute of x and the resulting integers are used. For +[[ and $ partial matching is used if exact matching fails, +so x$aa will match x$aabb if x does not contain a component +named "aa" and "aabb" is the only name which has prefix +"aa". For [[, partial matching can be controlled via the +exact argument which defaults to NA indicating that +partial matching is allowed, but should result in a warning when it +occurs. Setting exact to TRUE prevents partial matching +from occurring, a FALSE value allows it and does not issue any +warnings. Note that [ always requires an exact match. The string +"" is treated specially: it indicates ‘no name’ and matches no +element (not even those without a name). Note that partial matching is +only used when extracting and not when replacing. + +
  • Factor. The result is identical to x[as.integer(i)]. +The factor levels are never used. If so desired, use +x[as.character(i)] or a similar construction. + +
  • Empty. The expression x[] returns x, but drops +“irrelevant” attributes from the result. Only names and in +multi-dimensional arrays dim and dimnames attributes are +retained. + +
  • NULL. This is treated as if it were integer(0). + +
+ +

Indexing with a missing (i.e. NA) value gives an NA +result. This rule applies also to the case of logical indexing, +i.e. the elements of x that have an NA selector in +i get included in the result, but their value will be NA. + +

+

Notice however, that there are different modes of NA—the +literal constant is of mode "logical", but it is frequently +automatically coerced to other types. One effect of this is that +x[NA] has the length of x, but x[c(1, NA)] has +length 2. That is because the rules for logical indices apply in the +former case, but those for integer indices in the latter. +
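For example:

> x <- c(11, 12, 13)
> x[NA]
[1] NA NA NA
> x[c(1, NA)]
[1] 11 NA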

+

Indexing with [ will also carry out the relevant subsetting of +any names attributes. +

+
+ + + +

3.4.2 Indexing matrices and arrays

+ + +

Subsetting multi-dimensional structures generally follows the same rules +as single-dimensional indexing for each index variable, with the +relevant component of dimnames taking the place of names. +A couple of special rules apply, though: +

+

Normally, a structure is accessed using the number of indices +corresponding to its dimension. It is however also possible to use a +single index in which case the dim and dimnames attributes +are disregarded and the result is effectively that of c(m)[i]. +Notice that m[1] is usually very different from m[1, ] or +m[, 1]. +

+

It is possible to use a matrix of integers as an index. In this case, +the number of columns of the matrix should match the number of +dimensions of the structure, and the result will be a vector with length +as the number of rows of the matrix. The following example shows how +to extract the elements m[1, 1] and m[2, 2] in one +operation. +

+
+
> m <- matrix(1:4, 2)
+> m
+     [,1] [,2]
+[1,]    1    3
+[2,]    2    4
+> i <- matrix(c(1, 1, 2, 2), 2, byrow = TRUE)
+> i
+     [,1] [,2]
+[1,]    1    1
+[2,]    2    2
+> m[i]
+[1] 1 4
+
+ +

Indexing matrices may not contain negative indices. NA and +zero values are allowed: rows in an index matrix containing a zero are +ignored, whereas rows containing an NA produce an NA in +the result. +

+

Both in the case of using a single + +index and in matrix indexing, a names attribute is used if +present, as had the structure been one-dimensional. +

+

If an indexing operation causes the result to have one of its extents of +length one, as in selecting a single slice of a three-dimensional matrix +with (say) m[2, , ], the corresponding dimension is generally +dropped from the result. If a single-dimensional structure results, a +vector is obtained. This is occasionally undesirable and can be turned +off by adding the ‘drop = FALSE’ to the indexing operation. Notice +that this is an additional argument to the [ function and doesn’t +add to the index count. Hence the correct way of selecting the first +row of a matrix as a 1 by n matrix is m[1, , drop = +FALSE]. Forgetting to disable the dropping feature is a common cause +of failure in general subroutines where an index occasionally, but not +usually has length one. This rule still applies to a one-dimensional +array, where any subsetting will give a vector result unless ‘drop += FALSE’ is used. +
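For example:

> m <- matrix(1:6, 2, 3)
> m[1, ]
[1] 1 3 5
> m[1, , drop = FALSE]
     [,1] [,2] [,3]
[1,]    1    3    5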

+

Notice that vectors are distinct from one-dimensional arrays in that the +latter have dim and dimnames attributes (both of length +one). One-dimensional arrays are not easily obtained from subsetting +operations but they can be constructed explicitly and are returned by +table. This is sometimes useful because the elements of the +dimnames list may themselves be named, which is not the case for +the names attribute. +

+

Some operations such as m[FALSE, ] result in structures in which +a dimension has zero extent. R generally tries to handle these +structures sensibly. +

+
+ + + +

3.4.3 Indexing other structures

+ +

The operator [ is a generic function which allows class methods +to be added, and the $ and [[ operators likewise. Thus, +it is possible to have user-defined indexing operations for any +structure. Such a function, say [.foo is called with a set of +arguments of which the first is the structure being indexed and the rest +are the indices. In the case of $, the index argument is of mode +"symbol" even when using the x$"abc" form. It is +important to be aware that class methods do not necessarily behave in +the same way as the basic methods, for example with respect to partial +matching. +

+

The most important example of a class method for [ is that used for data frames. It is not described in detail here (see the help page for [.data.frame), but in broad terms, if two indices are supplied (even if one is empty) it creates matrix-like indexing for a structure that is basically a list of vectors of the same length. If a single index is supplied, it is interpreted as indexing the list of columns—in that case the drop argument is ignored, with a warning.

+

The basic operators $ and [[ can be applied to +environments. Only character indices are allowed and no partial +matching is done. +

+ +
+ + + +

3.4.4 Subset assignment

+ + + +

Assignment to subsets of a structure is a special case of a general +mechanism for complex assignment: +

+
x[3:5] <- 13:15
+
+

The result of this command is as if the following had been executed +

+
`*tmp*` <- x
+x <- "[<-"(`*tmp*`, 3:5, value=13:15)
+rm(`*tmp*`)
+
+ +

Note that the index is first converted to a numeric index and then the +elements are replaced sequentially along the numeric index, as if a +for loop had been used. Any existing variable called +`*tmp*` will be overwritten and deleted, and this variable name +should not be used in code. +

+

The same mechanism can be applied to functions other than [. The +replacement function has the same name with <- pasted on. Its last +argument, which must be called value, is the new value to be +assigned. For example, +

+
names(x) <- c("a","b")
+
+

is equivalent to +

+
`*tmp*` <- x
+x <- "names<-"(`*tmp*`, value=c("a","b"))
+rm(`*tmp*`)
+
+ +

Nesting of complex assignments is evaluated recursively +

+
names(x)[3] <- "Three"
+
+

is equivalent to +

+
`*tmp*` <- x
+x <- "names<-"(`*tmp*`, value="[<-"(names(`*tmp*`), 3, value="Three"))
+rm(`*tmp*`)
+
+ + + +

Complex assignments in the enclosing environment (using <<-) are +also permitted: +

+
names(x)[3] <<- "Three"
+
+

is equivalent to +

+
`*tmp*` <<- get(x, envir=parent.env(), inherits=TRUE)
+names(`*tmp*`)[3] <- "Three"
+x <<- `*tmp*`
+rm(`*tmp*`)
+
+

and also to +

+
`*tmp*` <- get(x,envir=parent.env(), inherits=TRUE)
+x <<- "names<-"(`*tmp*`, value="[<-"(names(`*tmp*`), 3, value="Three"))
+rm(`*tmp*`)
+
+ +

Only the target variable is evaluated in the enclosing environment, so +

+
e<-c(a=1,b=2)
+i<-1
+local({
+   e <- c(A=10,B=11)
+   i <-2
+   e[i] <<- e[i]+1
+})
+
+

uses the local value of i on both the LHS and RHS, and the local +value of e on the RHS of the superassignment statement. It sets +e in the outer environment to +

+
 a  b 
+ 1 12
+
+

That is, the superassignment is equivalent to the four lines +

+
`*tmp*` <- get(e, envir=parent.env(), inherits=TRUE)
+`*tmp*`[i] <- e[i]+1
+e <<- `*tmp*`
+rm(`*tmp*`)
+
+ +

Similarly +

+
x[is.na(x)] <<- 0
+
+

is equivalent to +

+
`*tmp*` <- get(x,envir=parent.env(), inherits=TRUE)
+`*tmp*`[is.na(x)] <- 0
+x <<- `*tmp*`
+rm(`*tmp*`)
+
+

and not to +

+
`*tmp*` <- get(x,envir=parent.env(), inherits=TRUE)
+`*tmp*`[is.na(`*tmp*`)] <- 0
+x <<- `*tmp*`
+rm(`*tmp*`)
+
+

These two candidate interpretations differ only if there is also a +local variable x. It is a good idea to avoid having a local +variable with the same name as the target variable of a +superassignment. As this case was handled incorrectly in versions +1.9.1 and earlier there must not be a serious need for such code. +

+ + + + + + + + + + + + + + +
+ + + +

3.5 Scope of variables

+ + + +

Almost every programming language has a set of scoping rules, allowing +the same name to be used for different objects. This allows, e.g., a +local variable in a function to have the same name as a global object. +

+

R uses a lexical scoping model, similar to languages like +Pascal. However, R is a functional programming language and +allows dynamic creation and manipulation of functions and language +objects, and has additional features reflecting this fact. +

+ + + + + + + +
+ + + +

3.5.1 Global environment

+ +

The global + +environment is the root of the user workspace. An + +assignment operation from the command line will cause the relevant +object to belong to the global environment. Its enclosing environment +is the next environment on the search path, and so on back to the +empty environment that is the enclosure of the base environment. +

+
+ +
+

+Next: , Previous: , Up: Scope of variables   [Contents][Index]

+
+ +

3.5.2 Lexical environment

+ +

Every call to a + +function creates a + + +frame which contains the local +variables created in the function, and is evaluated in an environment, +which in combination creates a new environment. +

+

Notice the terminology: A frame is a set of variables, an environment is +a nesting of frames (or equivalently: the innermost frame plus the +enclosing environment). +

+

Environments may be assigned to variables or be contained in other +objects. However, notice that they are not standard objects—in +particular, they are not copied on assignment. +

+

A closure (mode "function") object will contain the environment +in which it is created as part of its definition (By default. The +environment can be manipulated using environment<-). When the +function is subsequently called, its + +evaluation environment is created with the closure’s environment as +enclosure. Notice that this is not +necessarily the environment of the caller! +

+

Thus, when a variable is requested inside a + +function, it is first sought +in the + +evaluation environment, then in the enclosure, the enclosure of +the enclosure, etc.; once the global environment or the environment of +a package is reached, the +search continues up the search path +to the environment of the base package. If the variable is not +found there, the search will proceed next to the empty environment, and +will fail. +

+
+ + + +

3.5.3 The call stack

+ +

Every time a + +function is invoked a new evaluation frame is created. At +any point in time during the computation the currently active +environments are accessible through the call stack. Each time a +function is invoked a special construct called a context is created +internally and is placed on a list of contexts. When a function has +finished evaluating its context is removed from the call stack. +

+

Making variables defined higher up the call stack available is called + +dynamic scope. The binding for a variable is then determined by the most +recent (in time) definition of the variable. This contradicts the +default scoping rules in R, which use the bindings in the + +environment +in which the function was defined (lexical scope). Some functions, +particularly those that use and manipulate model formulas, need to +simulate dynamic scope by directly accessing the call stack. +

+

Access to the + +call stack is provided through a family of functions which +have names that start with ‘sys.’. They are listed briefly below. +

+ +
+
sys.call
+

Get the call for the specified context. +

+
sys.frame
+

Get the evaluation frame for the specified context. +

+
sys.nframe
+

Get the number of the current evaluation frame, i.e. how many frames are on the call stack at the point of the call.

+
sys.function
+

Get the function being invoked in the specified context. +

+
sys.parent
+

Get the parent of the current function invocation. +

+
sys.calls
+

Get the calls for all the active contexts. +

+
sys.frames
+

Get the evaluation frames for all the active contexts. +

+
sys.parents
+

Get the numeric labels for all active contexts. +

+
sys.on.exit
+

Set a function to be executed when the specified context is exited. +

+
sys.status
+

Calls sys.frames, sys.parents and sys.calls. +

+
parent.frame
+

Get the evaluation frame for the specified parent context. +

+
+ +
+ +
+

+Previous: , Up: Scope of variables   [Contents][Index]

+
+ +

3.5.4 Search path

+ +

In addition to the evaluation + + +environment structure, R has a search +path of environments which are searched for variables not found +elsewhere. This is used for two things: packages of functions and +attached user data. +

+

The first element of the search path is the global environment and the +last is the base package. An Autoloads environment is used for +holding proxy objects that may be loaded on demand. Other environments +are inserted in the path using attach or library. +

+ +

Packages which have a namespace have a different search path. +When a search for an R object is started from an object in such a +package, the package itself is searched first, then its imports, then +the base namespace and finally the global environment and the rest of the +regular search path. The effect is that references to other objects in +the same package will be resolved to the package, and objects cannot be +masked by objects of the same name in the global environment or in other +packages. +

+ +
+ + + +

4 Functions

+ + + + + + + +
+ +
+

+Next: , Previous: , Up: Functions   [Contents][Index]

+
+ +

4.1 Writing functions

+ +

While R can be very useful as a data analysis tool most users very +quickly find themselves wanting to write their own + +functions. This is +one of the real advantages of R. Users can program it and they can, +if they want to, change the system level functions to functions that +they find more appropriate. +

+

R also provides facilities that make it easy to document any +functions that you have created. See Writing R documentation in Writing R Extensions. +

+ + + + + +
+ +
+

+Next: , Previous: , Up: Writing functions   [Contents][Index]

+
+ +

4.1.1 Syntax and examples

+ +

The syntax for writing a + +function is +

+
+
function ( arglist ) body
+
+ +

The first component of the function declaration is the keyword +function which indicates to R that you want to create a +function. +

+

An + +argument list is a comma separated list of formal arguments. A +formal argument can be a symbol, a statement of the form +‘symbol = expression’, or the special formal argument +‘...’. +

+

The body can be any valid R expression. Generally, the body +is a group of expressions contained in curly braces (‘{’ and +‘}’). +

+

Generally + +functions are assigned to symbols but they don’t need to be. +The value returned by the call to function is a function. If +this is not given a name it is referred to as an + +anonymous +function. Anonymous functions are most frequently used as arguments to +other functions such as the apply family or outer. +

+

Here is a simple function: echo <- function(x) print(x). So +echo is a function that takes a single argument and when +echo is invoked it prints its argument. +

+
+ + + +

4.1.2 Arguments

+ +

The formal arguments to the function define the variables whose values +will be supplied at the time the function is invoked. The names of +these arguments can be used within the function body where they obtain +the value supplied at the time of function invocation. +

+ +

Default values for arguments can be specified using the special form +‘name = expression’. In this case, if the user does +not specify a value for the argument when the function is invoked the +expression will be associated with the corresponding symbol. When a +value is needed the expression is + +evaluated in the evaluation +frame of the function. +

+

Default behaviours can also be specified by using the function +missing. When missing is called with the + +name of a formal +argument it returns TRUE if the formal argument was not matched +with any actual argument and has not been subsequently modified in the +body of the function. An argument that is missing will thus +have its default value, if any. The missing function does not +force evaluation of the argument. +
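For example (the function f and its default expression are purely illustrative):

> f <- function(x, y = 2 * x) {
+     if (missing(y)) cat("y was not supplied\n")
+     x + y
+ }
> f(3)
y was not supplied
[1] 9
> f(3, 10)
[1] 13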

+

The special type of argument ‘...’ can contain any number of +supplied arguments. It is used for a variety of purposes. It allows +you to write a + +function that takes an arbitrary number of arguments. It +can be used to absorb some arguments into an intermediate function which +can then be extracted by functions called subsequently. +

+
+ +
+

+Next: , Previous: , Up: Functions   [Contents][Index]

+
+ +

4.2 Functions as objects

+ +

Functions are first class objects in R. They can be used anywhere +that an R object is required. In particular they can be passed as +arguments to functions and returned as values from functions. See +Function objects for the details. +

+
+ +
+

+Previous: , Up: Functions   [Contents][Index]

+
+ +

4.3 Evaluation

+ + + + + + + + +
+ +
+

+Next: , Previous: , Up: Evaluation   [Contents][Index]

+
+ +

4.3.1 Evaluation environment

+ +

When a + +function is called or invoked a new + +evaluation frame is created. +In this frame the formal arguments are matched with the supplied +arguments according to the rules given in Argument matching. The +statements in the body of the function are evaluated sequentially in +this + +environment frame. +

+

The enclosing frame of the evaluation frame is the environment frame +associated with the function being invoked. This may be different from +S. While many functions have .GlobalEnv as their environment +this does not have to be true and functions defined in packages with +namespaces (normally) have the package namespace as their environment. +

+
+ + + +

4.3.2 Argument matching

+ +

This subsection applies to closures but not to primitive functions. The +latter typically ignore tags and do positional matching, but their help +pages should be consulted for exceptions, which include log, +round, signif, rep and seq.int. +

+

The first thing that occurs in a + +function evaluation is the matching of +formal to the actual or supplied arguments. +This is done by a three-pass process: +

+
    +
  1. Exact matching on tags. + +For each named supplied argument the list of formal arguments is +searched for an item whose name matches exactly. It is an error to have +the same formal argument match several actuals or vice versa. + +
  2. Partial matching on tags. +Each remaining named supplied argument is compared to the remaining formal +arguments using partial matching. If the name of the supplied argument +matches exactly with the first part of a formal argument then the two +arguments are considered to be matched. It is an error to have multiple +partial matches. Notice that if f <- function(fumble, +fooey) fbody, then f(f = 1, fo = 2) is illegal, even though the +2nd actual argument only matches fooey. f(f = 1, fooey = +2) is legal though since the second argument matches exactly and +is removed from consideration for partial matching. If the formal +arguments contain ‘...’ then partial matching is only applied to +arguments that precede it. + +
  3. Positional matching. +Any unmatched formal arguments are bound to unnamed supplied +arguments, in order. If there is a ‘...’ argument, it will take up +the remaining arguments, tagged or not. + +
+ +

If any arguments remain unmatched an error is declared. +

+

Argument matching is augmented by the functions match.arg, +match.call and match.fun. + + + +Access to the partial matching algorithm used by R is via +pmatch. +
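For example, using the function from the partial matching rule above:

> f <- function(fumble, fooey) c(fumble, fooey)
> f(f = 1, fooey = 2)   # fooey matches exactly, so f can partially match fumble
[1] 1 2
> f(fu = 1, fo = 2)     # both partial matches are unambiguous
[1] 1 2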

+
+ +
+

+Next: , Previous: , Up: Evaluation   [Contents][Index]

+
+ +

4.3.3 Argument evaluation

+ +

One of the most important things to know about the + +evaluation of +arguments to a + +function is that supplied arguments and default arguments +are treated differently. The supplied arguments to a function are +evaluated in the evaluation frame of the calling function. The default +arguments to a function are evaluated in the evaluation frame of the +function. +

+

The semantics of invoking a function in R are call-by-value. In general, supplied arguments behave as if they are local variables initialized with the value supplied and the name of the corresponding formal argument. Changing the value of a supplied argument within a function will not affect the value of the variable in the calling frame.

+

R has a form of lazy evaluation of function arguments. Arguments are +not evaluated until needed. It is important to realize that in some +cases the argument will never be evaluated. Thus, it is bad style to +use arguments to functions to cause side-effects. While in C it is +common to use the form, foo(x = y) to invoke foo with the +value of y and simultaneously to assign the value of y to +x this same style should not be used in R. There is no +guarantee that the argument will ever be evaluated and hence the + +assignment may not take place. +

+

It is also worth noting that the effect of foo(x <- y) if the +argument is evaluated is to change the value of x in the calling + +environment and not in the + +evaluation environment of foo. +

+

It is possible to access the actual (not default) expressions used as +arguments inside the function. The mechanism is implemented via +promises. When a + +function is being evaluated the actual expression used as an argument is +stored in the promise together with a pointer to the environment the +function was called from. When (if) the argument is evaluated the +stored expression is evaluated in the environment that the function was +called from. Since only a pointer to the environment is used any +changes made to that environment will be in effect during this +evaluation. The resulting value is then also stored in a separate spot +in the promise. Subsequent evaluations retrieve this stored value (a +second evaluation is not carried out). Access to the unevaluated +expression is also available using substitute. +

+

When a + +function is called, each formal argument is assigned a promise in the +local environment of the call with the expression slot containing the +actual argument (if it exists) and the environment slot containing the +environment of the caller. If no actual argument for a formal argument +is given in the call and there is a default expression, it is similarly +assigned to the expression slot of the formal argument, but with the + +environment set +to the local environment. +

+

The process of filling the value slot of a promise by + +evaluating the +contents of the expression slot in the promise’s environment is called +forcing the promise. A promise will only be forced once, the +value slot content being used directly later on. +

+

A promise is forced when its value is needed. This usually happens +inside internal + + +functions, but a promise can also be forced by direct evaluation of the +promise itself. This is occasionally useful when a default expression +depends on the value of another formal argument or other variable in the +local environment. This is seen in the following example where the lone +label ensures that the label is based on the value of x +before it is changed in the next line. +

+
+
function(x, label = deparse(x)) {
+    label
+    x <- x + 1
+    print(label)
+}
+
+ +

The expression slot of a promise can itself involve other promises. +This happens whenever an unevaluated argument is passed as an argument +to another function. When forcing a promise, other promises in its +expression will also be forced recursively as they are evaluated. +

+
+ +
+

+Previous: , Up: Evaluation   [Contents][Index]

+
+ +

4.3.4 Scope

+ + +

Scope or the scoping rules are simply the set of rules used by the + +evaluator to find a value for a + +symbol. Every computer language has a +set of such rules. In R the rules are fairly simple but there do +exist mechanisms for subverting the usual, or default rules. +

+

R adheres to a set of rules that are called lexical scope. +This means the variable + +bindings in effect at the time the expression +was created are used to provide values for any unbound symbols in the +expression. +

+

Most of the interesting properties of + +scope are involved with evaluating + +functions and we concentrate on this issue. A symbol can be either + +bound or unbound. All of the formal arguments to a function provide +bound symbols in the body of the function. Any other symbols in the +body of the function are either local variables or unbound variables. A +local variable is one that is defined within the function. Because R +has no formal definition of variables, they are simply used as needed, +it can be difficult to determine whether a variable is local or not. +Local variables must first be defined, this is typically done by having +them on the left-hand side of an + +assignment. +

+

During the evaluation process if an unbound symbol is detected then R +attempts to find a value for it. The scoping rules determine how this +process proceeds. In R the + +environment of the function is searched +first, then its enclosure and so on until the global environment is reached. +

+

The global environment heads a search list of environments that are searched +sequentially for a matching symbol. The value of the first match is then used. +

+

When this set of rules is combined with the fact that + +functions can be +returned as values from other functions then some rather nice, but at +first glance peculiar, properties obtain. +

+

A simple example: +

+
+
f <- function() {
+    y <- 10
+    g <- function(x) x + y
+    return(g)
+}
+h <- f()
+h(3)
+
+ + +

A rather interesting question is what happens when h is +evaluated. To describe this we need a bit more notation. Within a + +function body variables can be bound, local or unbound. The bound +variables are those that match the formal arguments to the function. +The local variables are those that were created or defined within the +function body. The unbound variables are those that are neither local +nor bound. When a function body is evaluated there is no problem +determining values for local variables or for bound variables. Scoping +rules determine how the language will find values for the unbound +variables. +

+

When h(3) is evaluated we see that its body is that of +g. Within that body x is bound to the formal argument +and y is unbound. In a language with + +lexical scope x will be associated with the value 3 and +y with the value 10 local to f so h(3) should return the value 13. +In R this is indeed what happens. +

+

In S, because of the different scoping rules one will get an error +indicating that y is not found, unless there is a variable +y in your workspace in which case its value will be used. +

+ + + + + + + +
+ +
+

+Next: , Previous: , Up: Top   [Contents][Index]

+
+ +

5 Object-oriented programming

+ + +

Object-oriented programming is a style of programming that has become +popular in recent years. Much of the popularity comes from the fact +that it makes it easier to write and maintain complicated systems. It +does this through several different mechanisms. +

+

Central to any object-oriented language are the concepts of class and of +methods. A class is a definition of an object. Typically a +class contains several slots that are used to hold class-specific +information. An object in the language must be an instance of some +class. Programming is based on objects or instances of classes. +

+

Computations are carried out via methods. Methods are basically + +functions that are specialized to carry out specific calculations on +objects, usually of a specific class. This is what makes the language +object oriented. In R, generic functions are used to +determine the appropriate method. The generic function is responsible +for determining the class of its argument(s) and uses that information +to select the appropriate method. +

+

Another feature of most object-oriented languages is the concept of +inheritance. In most programming problems there are usually many +objects that are related to one another. The programming is +considerably simplified if some components can be reused. +

+

If a class inherits from another class then generally it gets all the +slots in the parent class and can extend it by adding new slots. On +method dispatching (via the generic functions) if a method for the class +does not exist then a method for the parent is sought. +

+

In this chapter we discuss how this general strategy has been +implemented in R and discuss some of the limitations within the +current design. One of the advantages that most object systems impart +is greater consistency. This is achieved via the rules that are checked +by the compiler or interpreter. Unfortunately because of the way that +the object system is incorporated into R this advantage does not +obtain. Users are cautioned to use the object system in a +straightforward manner. While it is possible to perform some rather +interesting feats these tend to lead to obfuscated code and may depend +on implementation details that will not be carried forward. +

+

The greatest use of object oriented programming in R is through +print methods, summary methods and plot methods. +These methods allow us to have one generic + +function call, plot +say, that dispatches on the type of its argument and calls a plotting +function that is specific to the data supplied. +

+

In order to make the concepts clear we will consider the implementation +of a small system designed to teach students about probability. In this +system the objects are probability functions and the methods we will +consider are methods for finding moments and for plotting. +Probabilities can always be represented in terms of the cumulative +distribution function but can often be represented in other ways. For +example as a density, when it exists or as a moment generating function +when it exists. +

+ + + + + + + + + + + +
+ + + +

5.1 Definition

+ +

Rather than having a full-fledged + +object-oriented system R has a +class system and a mechanism for dispatching based on the class of an +object. The dispatch mechanism for interpreted code relies on four +special objects that are stored in the evaluation frame. These special +objects are .Generic, .Class, .Method and +.Group. There is a separate dispatch mechanism used for internal +functions and types that will be discussed elsewhere. +

+

The class system is facilitated through the class attribute. This attribute is a character vector of class names. So to create an object of class "foo" one simply attaches a class attribute with the string ‘"foo"’ in it. Thus, virtually anything can be turned into an object of class "foo".

+
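For instance, a minimal sketch (the list contents and the class name "foo" are purely illustrative):

x <- list(a = 1, b = 2)
class(x) <- "foo"        # x is now an object of class "foo"
inherits(x, "foo")       # TRUE
unclass(x)               # the underlying list, without the class attribute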

The object system makes use of + +generic functions via two +dispatching functions, UseMethod and NextMethod. The +typical use of the object system is to begin by calling a generic +function. This is typically a very simple function and consists of a +single line of code. The system function mean is just such a +function, +

+
+
> mean
+function (x, ...)
+UseMethod("mean")
+
+ +

When mean is called it can have any number of arguments but its +first argument is special and the class of that first argument is used +to determine which method should be called. The variable .Class +is set to the class attribute of x, .Generic is set to the +string "mean" and a search is made for the correct method to +invoke. The class attributes of any other arguments to mean are +ignored. +

+

Suppose that x had a class attribute that contained "foo" +and "bar", in that order. Then R would first search for a +function called mean.foo and if it did not find one it would then +search for a function mean.bar and if that search was also +unsuccessful then a final search for mean.default would be made. +If the last search is unsuccessful R reports an error. It is a good +idea to always write a default method. Note that the functions +mean.foo etc. are referred to, in this context, as methods. +

+

NextMethod provides another mechanism for dispatching. A + +function may have a call to NextMethod anywhere in it. The +determination of which method should then be invoked is based primarily +on the current values of .Class and .Generic. This is +somewhat problematic since the method is really an ordinary function and +users may call it directly. If they do so then there will be no values +for .Generic or .Class. +

+

If a method is invoked directly and it contains a call to +NextMethod then the first argument to NextMethod is used +to determine the + +generic function. An error is signalled if this +argument has not been supplied; it is therefore a good idea to always +supply this argument. +

+

In the case that a method is invoked directly the class attribute of the +first argument to the method is used as the value of .Class. +

+

Methods themselves employ NextMethod to provide a form of +inheritance. Commonly a specific method performs a few operations to +set up the data and then it calls the next appropriate method through a +call to NextMethod. +

+ + + + +

Consider the following simple example. A point in two-dimensional +Euclidean space can be specified by its Cartesian (x-y) or polar +(r-theta) coordinates. Hence, to store information about the location +of the point, we could define two classes, "xypoint" and +"rthetapoint". All the ‘xypoint’ data structures are lists with +an x-component and a y-component. All ‘rthetapoint’ objects are lists +with an r-component and a theta-component. +

+

Now, suppose we want to get the x-position from either type of object. +This can easily be achieved through + +generic functions. We define the +generic function xpos as follows. +

+
+
xpos <- function(x, ...)
+    UseMethod("xpos")
+
+ +

Now we can define methods: +

+
+
xpos.xypoint <- function(x) x$x
+xpos.rthetapoint <- function(x) x$r * cos(x$theta)
+
+ +

The user simply calls the function xpos with either +representation as the argument. The internal dispatching method finds +the class of the object and calls the appropriate methods. +

+

It is pretty easy to add other representations. One need not write a new generic function, only the methods. This makes it easy to add to existing systems since the user is only responsible for dealing with the new representation and not with any of the existing representations.

+

The bulk of the uses of this methodology are to provide specialized +printing for objects of different types; there are about 40 methods for +print. +

+
+ + + +

5.2 Inheritance

+ + +

The class attribute of an object can have several elements. When a generic function is called, inheritance is mainly handled through NextMethod. NextMethod determines the method currently being evaluated, finds the next class from the current value of .Class and dispatches to the corresponding method.

+

FIXME: something is missing here +

+
+ + + +

5.3 Method dispatching

+ + +

Generic functions should consist of a single statement. They should +usually be of the form foo <- function(x, ...) UseMethod("foo", +x). When UseMethod is called, it determines the appropriate +method and then that method is invoked with the same arguments, in +the same order as the call to the generic, as if the call had been made +directly to the method. +

+

In order to determine the correct method the class attribute of the +first argument to the generic is obtained and used to find the correct +method. The + +name of the generic function is combined with the first element of the +class attribute into the form, generic.class and a +function with that name is sought. If the function is found then it is +used. If no such function is found then the second element of the class +attribute is used, and so on until all the elements of the class +attribute have been exhausted. If no method has been found at that +point then the method generic.default is used. If +the first argument to the generic function has no class attribute then +generic.default is used. Since the introduction of +namespaces the methods may not be accessible by their names +(i.e. get("generic.class") may fail), but they will +be accessible by getS3method("generic","class"). +

+ +
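As a small sketch of this lookup (the class name "foo" and the method defined here are invented for illustration):

print.foo <- function(x, ...) cat("an object of class foo\n")
y <- structure(list(1, 2), class = c("foo", "list"))
print(y)                      # the search finds print.foo and uses it
getS3method("print", "foo")   # retrieves the method by generic and class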

Any object can have a class attribute. This attribute can have +any number of elements. Each of these is a string that defines a class. +When a generic function is invoked the class of its first argument is +examined. +

+
+ + + +

5.4 UseMethod

+ + +

UseMethod is a special function and it behaves differently from +other function calls. The syntax of a call to it is +UseMethod(generic, object), where generic is +the name of the generic function, object is the object used to +determine which method should be chosen. UseMethod can only be +called from the body of a function. +

+ +

UseMethod changes the evaluation model in two ways. First, when +it is invoked it determines the next method (function) to be called. It +then invokes that function using the current evaluation + +environment; this process will be described shortly. The second way in +which UseMethod changes the evaluation environment is that it +does not return control to the calling function. This means, that any +statements after a call to UseMethod are guaranteed not to be +executed. +

+

When UseMethod is invoked the generic function is the specified +value in the call to UseMethod. The object to dispatch on is +either the supplied second argument or the first argument to the current +function. The class of the argument is determined and the first element +of it is combined with the name of the generic to determine the +appropriate method. So, if the generic had name foo and the +class of the object is "bar", then R will search for a method +named foo.bar. If no such method exists then the inheritance +mechanism described above is used to locate an appropriate method. +

+

Once a method has been determined R invokes it in a special way. +Rather than creating a new evaluation + +environment R uses the +environment of the current function call (the call to the generic). Any + +assignments or evaluations that were made before the call to +UseMethod will be in effect. The arguments that were used in the +call to the generic are rematched to the formal arguments of the +selected method. +

+

When the method is invoked it is called with arguments that are the same +in number and have the same names as in the call to the generic. They +are matched to the arguments of the method according to the standard +R rules for argument matching. However the object, i.e. the first +argument has been evaluated. +

+

The call to UseMethod has the effect of placing some special objects in the evaluation frame. They are .Class, .Generic and .Method. These special objects are used by R to handle the method dispatch and inheritance. .Class is the class of the object, .Generic is the name of the generic function and .Method is the name of the method currently being invoked. If the method was invoked through one of the internal interfaces then there may also be an object called .Group. This will be described in Section Group methods. After the initial call to UseMethod these special variables, not the object itself, control the selection of subsequent methods.

+

The body of the method is then evaluated in the standard fashion. In +particular variable look-up in the body follows the rules for the +method. So if the method has an associated environment then that is +used. In effect we have replaced the call to the generic by a call to +the method. Any local + +assignments in the frame of the generic will be +carried forward into the call to the method. Use of this feature +is discouraged. It is important to realize that control will never +return to the generic and hence any expressions after a call to +UseMethod will never be executed. +

+

Any arguments to the generic that were evaluated prior to the call to +UseMethod remain evaluated. +

+

If the first argument to UseMethod is not supplied it is assumed +to be the name of the current function. If two arguments are supplied +to UseMethod then the first is the name of the method and the +second is assumed to be the object that will be dispatched on. It is +evaluated so that the required method can be determined. In this case +the first argument in the call to the generic is not evaluated and is +discarded. There is no way to change the other arguments in the call to +the method; these remain as they were in the call to the generic. This +is in contrast to NextMethod where the arguments in the call to +the next method can be altered. +

+
+ + + +

5.5 NextMethod

+ + +

NextMethod is used to provide a simple inheritance mechanism. +

+
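A minimal sketch of such chaining (the generic describe and the classes used are invented for illustration):

describe <- function(x, ...) UseMethod("describe")
describe.animal <- function(x, ...) cat("an animal\n")
describe.dog <- function(x, ...) { cat("a dog, and more generally "); NextMethod() }
rex <- structure(list(), class = c("dog", "animal"))
describe(rex)     # prints "a dog, and more generally an animal"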

Methods invoked as a result of a call to NextMethod behave as if +they had been invoked from the previous method. The arguments to the +inherited method are in the same order and have the same names as the +call to the current method. This means that they are the same as for +the call to the generic. However, the expressions for the arguments are +the names of the corresponding formal arguments of the current method. +Thus the arguments will have values that correspond to their value at +the time NextMethod was invoked. +

+

Unevaluated arguments remain unevaluated. Missing arguments remain +missing. +

+

The syntax for a call to NextMethod is NextMethod(generic, +object, ...). If the generic is not supplied the value of +.Generic is used. If the object is not supplied the first +argument in the call to the current method is used. Values in the +‘...’ argument are used to modify the arguments of the next method. +

+

It is important to realize that the choice of the next method depends on +the current values of .Generic and .Class and not on the +object. So changing the object in a call to NextMethod affects +the arguments received by the next method but does not affect the choice +of the next method. +

+

Methods can be called directly. If they are then there will be no +.Generic, .Class or .Method. In this case the +generic argument of NextMethod must be specified. The +value of .Class is taken to be the class attribute of the object +which is the first argument to the current function. The value of +.Method is the name of the current function. These choices for +default values ensure that the behaviour of a method doesn’t change +depending on whether it is called directly or via a call to a generic. +

+ +

An issue for discussion is the behaviour of the ‘...’ argument to +NextMethod. The White Book describes the behaviour as follows: +

+ +

- named arguments replace the corresponding arguments in the call to + the current method. Unnamed arguments go at the start of the argument + list. +

+

What I would like to do is: +

+

  • first do the argument matching for NextMethod;
  • if the object or generic are changed, fine;
  • first, if a named list element matches an argument (named or not), the list value replaces the argument value;
  • the first unnamed list element

+

Values for lookup: + Class: comes first from .Class, second from the first argument to the + method and last from the object specified in the call to NextMethod +

+

Generic: comes first from .Generic, if nothing then from the first + argument to the method and if it’s still missing from the call to + NextMethod +

+

Method: this should just be the current function name. +

+ + + +
+ + + +

5.6 Group methods

+ +

For several types of + +internal functions R provides a dispatching +mechanism for operators. This means that operators such as == or +< can have their behaviour modified for members of special +classes. The functions and operators have been grouped into three +categories and group methods can be written for each of these +categories. There is currently no mechanism to add groups. It is +possible to write methods specific to any function within a group. +

+

The following table lists the functions for the different Groups. +

+
+
Math
+

abs, acos, acosh, asin, asinh, atan, atanh, ceiling, cos, cosh, cospi, cumsum, +exp, floor, gamma, lgamma, log, log10, round, signif, sin, sinh, sinpi, +tan, tanh, tanpi, trunc +

+
+
Summary
+

all, any, max, min, prod, range, sum +

+
+
Ops
+

+, -, *, /, ^, < , >, +<=, >=, !=, ==, %%, %/%, +&, |, ! +

+
+ +

For operators in the Ops group a special method is invoked if the two +operands taken together suggest a single method. Specifically, if both +operands correspond to the same method or if one operand corresponds to +a method that takes precedence over that of the other operand. If they +do not suggest a single method then the default method is used. Either +a group method or a class method dominates if the other operand has no +corresponding method. A class method dominates a group method. +

+

When the group is Ops the special variable .Method is a string +vector with two elements. The elements of .Method are set to the +name of the method if the corresponding argument is a member of the +class that was used to determine the method. Otherwise the +corresponding element of .Method is set to the zero length +string, "". +

+
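A small sketch of an Ops group method (the class "money" is invented for illustration; calling NextMethod for the underlying operation follows the description above):

Ops.money <- function(e1, e2) {
    v <- NextMethod()                     # perform the underlying operation on the data
    if (.Generic %in% c("+", "-")) class(v) <- "money"
    v
}
m <- structure(100, class = "money")
m + 5      # dispatches to Ops.money; the result keeps class "money"
m > 50     # comparisons fall through unchanged: TRUE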
+ + + +

5.7 Writing methods

+ +

Users can easily write their own methods and generic functions. A + +generic function is simply a function with a call to UseMethod. +A method is simply a function that has been invoked via method dispatch. +This can be as a result of a call to either UseMethod or +NextMethod. +

+

It is worth remembering that methods can be called directly. That means +that they can be entered without a call to UseMethod having been +made and hence the special variables .Generic, .Class and +.Method will not have been instantiated. In that case the +default rules detailed above will be used to determine these. +

+

The most common use of + +generic functions is to provide print and +summary methods for statistical objects, generally the output of +some model fitting process. To do this, each model attaches a class +attribute to its output and then provides a special method that takes +that output and provides a nice readable version of it. The user then +needs only remember that print or summary will provide +nice output for the results of any analysis. +

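A sketch of this pattern (the class "toyfit" and its contents are invented for illustration):

print.toyfit <- function(x, ...) {
    cat("A toy fit with coefficients:\n")
    print(x$coef)
    invisible(x)
}
fit <- structure(list(coef = c(intercept = 1.2, slope = -0.5)), class = "toyfit")
fit        # auto-printing dispatches to print.toyfit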
+ + + + +
+ + + +

6 Computing on the language

+ +

R belongs to a class of programming languages in which subroutines +have the ability to modify or construct other subroutines and evaluate +the result as an integral part of the language itself. This is similar +to Lisp and Scheme and other languages of the “functional programming” +variety, but in contrast to FORTRAN and the ALGOL family. The Lisp +family takes this feature to the extreme by the “everything is a list” +paradigm in which there is no distinction between programs and data. +

+

R presents a friendlier interface to programming than Lisp does, at +least to someone used to mathematical formulas and C-like control +structures, but the engine is really very Lisp-like. R allows direct +access to + +parsed expressions and functions and allows you to alter and +subsequently execute them, or create entirely new functions from +scratch. +

+

There is a number of standard applications of this facility, such as +calculation of analytical derivatives of expressions, or the generation +of polynomial functions from a vector of coefficients. However, there +are also uses that are much more fundamental to the workings of the +interpreted part of R. Some of these are essential to the reuse of +functions as components in other functions, as the (admittedly not very +pretty) calls to model.frame that are constructed in several +modeling and plotting routines. Other uses simply allow elegant +interfaces to useful functionality. As an example, consider the +curve function, which allows you to draw the graph of a function +given as an expression like sin(x) or the facilities for plotting +mathematical expressions. +

+

In this chapter, we give an introduction to the set of facilities that +are available for computing on the language. +

+ + + + + + + + + +
+ + + +

6.1 Direct manipulation of language objects

+ +

There are three kinds of language objects that are available for +modification, calls, expressions, and functions. At this point, we +shall concentrate on the call objects. These are sometimes referred to +as “unevaluated expressions”, although this terminology is somewhat +confusing. The most direct method of obtaining a call object is to use +quote with an expression argument, e.g., +

+
+
> e1 <- quote(2 + 2)
+> e2 <- quote(plot(x, y))
+
+ +

The arguments are not evaluated, the result is simply the parsed +argument. The objects e1 and e2 may be evaluated later +using eval, or simply manipulated as data. It is perhaps most +immediately obvious why the e2 object has mode "call", +since it involves a call to the plot function with some +arguments. However, e1 actually has exactly the same structure +as a call to the binary operator + with two arguments, a fact +that gets clearly displayed by the following +

+
+
> quote("+"(2, 2))
+2 + 2
+
+ +

The components of a call object are accessed using a list-like syntax, +and may in fact be converted to and from lists using as.list and +as.call +

+
+
> e2[[1]]
+plot
+> e2[[2]]
+x
+> e2[[3]]
+y
+
+ +

When keyword argument matching is used, the keywords can be used as list +tags: +

+
+
> e3 <- quote(plot(x = age, y = weight))
+> e3$x
+age
+> e3$y
+weight
+
+ +

All the components of the call object have mode "name" in the +preceding examples. This is true for identifiers in calls, but the +components of a call can also be constants—which can be of any type, +although the first component had better be a function if the call is to +be evaluated successfully—or other call objects, corresponding to +subexpressions. Objects of mode + +name can be constructed from character +strings using as.name, so one might modify the e2 object +as follows +

+
+
> e2[[1]] <- as.name("+")
+> e2
+x + y
+
+ +

To illustrate the fact that subexpressions are simply components that +are themselves calls, consider +

+
+
> e1[[2]] <- e2
+> e1
+x + y + 2
+
+ + +

All grouping parentheses in input are preserved in parsed expressions. +They are represented as a function call with one argument, so that +4 - (2 - 2) becomes "-"(4, "(" ("-"(2, 2))) in prefix +notation. In evaluations, the ‘(’ operator just returns its +argument. +

+

This is a bit unfortunate, but it is not easy to write a + +parser/deparser +combination that both preserves user input, stores it in minimal form +and ensures that parsing a deparsed expression gives the same expression +back. +

+

As it happens, R’s parser is not perfectly invertible, nor is its +deparser, as the following examples show +

+
+
> str(quote(c(1,2)))
+ language c(1, 2)
+> str(c(1,2))
+ num [1:2] 1 2
+> deparse(quote(c(1,2)))
+[1] "c(1, 2)"
+> deparse(c(1,2))
+[1] "c(1, 2)"
+> quote("-"(2, 2))
+2 - 2
+> quote(2 - 2)
+2 - 2
+
+ +

Deparsed expressions should, however, evaluate to an equivalent value +to the original expression (up to rounding error). +

+

...internal storage of flow control constructs...note Splus +incompatibility... +

+
+ + + +

6.2 Substitutions

+ +

It is in fact not often that one wants to modify the innards of an +expression like in the previous section. More frequently, one wants to +simply get at an expression in order to deparse it and use it for +labeling plots, for instance. An example of this is seen at the +beginning of plot.default: + +

+
+
xlabel <- if (!missing(x))
+    deparse(substitute(x))
+
+ +

This causes the variable or expression given as the x argument to +plot to be used for labeling the x-axis later on. +

+

The function used to achieve this is substitute which takes the +expression x and substitutes the expression that was passed +through the formal argument x. Notice that for this to happen, +x must carry information about the expression that creates its +value. This is related to the + +lazy evaluation scheme of R +(see Promise objects). A formal argument is really a +promise, an object with three slots, one for the expression that +defines it, one for the environment in which to evaluate that expression, +and one for the value of that expression once evaluated. substitute +will recognize a promise variable and substitute the value of its +expression slot. If substitute is invoked inside a function, the +local variables of the function are also subject to substitution. +

+

The argument to substitute does not have to be a simple +identifier, it can be an expression involving several variables and +substitution will occur for each of these. Also, substitute has +an additional argument which can be an environment or a list in which +the variables are looked up. For example: +

+
+
> substitute(a + b, list(a = 1, b = quote(x)))
+1 + x
+
+ +

Notice that quoting was necessary to substitute the x. This kind +of construction comes in handy in connection with the facilities for +putting math expression in graphs, as the following case shows +

+
+
> plot(0)
+> for (i in 1:4)
++   text(1, 0.2 * i,
++        substitute(x[ix] == y, list(ix = i, y = pnorm(i))))
+
+ +

It is important to realize that the substitutions are purely lexical; +there is no checking that the resulting call objects make sense if they +are evaluated. substitute(x <- x + 1, list(x = 2)) will happily +return 2 <- 2 + 1. However, some parts of R make up their own +rules for what makes sense and what does not and might actually have a +use for such ill-formed expressions. For example, using the “math in +graphs” feature often involves constructions that are syntactically +correct, but which would be meaningless to evaluate, like +‘{}>=40*" years"’. +

+

Substitute will not evaluate its first argument. This leads to the +puzzle of how to do substitutions on an object that is contained in a +variable. The solution is to use substitute once more, like this +

+
+
> expr <- quote(x + y)
+> substitute(substitute(e, list(x = 3)), list(e = expr))
+substitute(x + y, list(x = 3))
+> eval(substitute(substitute(e, list(x = 3)), list(e = expr)))
+3 + y
+
+ +

The exact rules for substitutions are as follows: Each + +symbol in the + +parse tree for the first is matched against the second argument, which +can be a tagged list or an environment frame. If it is a simple local +object, its value is inserted, except if matching against the +global environment. If it is a promise (usually a function argument), +the promise expression is substituted. If the symbol is not matched, it +is left untouched. The special exception for substituting at the top +level is admittedly peculiar. It has been inherited from S and the +rationale is most likely that there is no control over which variables +might be bound at that level so that it would be better to just make +substitute act as quote. +

+

The rule of promise substitution is slightly different from that of +S if the local variable is modified before substitute is +used. R will then use the new value of the variable, whereas S +will unconditionally use the argument expression—unless it was a +constant, which has the curious consequence that f((1)) may be +very different from f(1) in S. The R rule is considerably +cleaner, although it does have consequences in connection with + +lazy +evaluation that comes as a surprise to some. Consider +

+
+
logplot <- function(y, ylab = deparse(substitute(y))) {
+    y <- log(y)
+    plot(y, ylab = ylab)
+}
+
+ +

This looks straightforward, but one will discover that the y label +becomes an ugly c(...) expression. It happens because the rules +of lazy evaluation cause the evaluation of the ylab expression +to happen after y has been modified. The solution is to +force ylab to be evaluated first, i.e., +

+
+
logplot <- function(y, ylab = deparse(substitute(y))) {
+    ylab
+    y <- log(y)
+    plot(y, ylab = ylab)
+}
+
+ +

Notice that one should not use eval(ylab) in this situation. If +ylab is a language or expression object, then that would cause +the object to be evaluated as well, which would not at all be desirable +if a math expression like quote(log[e](y)) was being passed. +

+ +

A variant on substitute is bquote, which is used to replace some subexpressions with their values. The example from above +

+
> plot(0)
+> for (i in 1:4)
++   text(1, 0.2 * i,
++        substitute(x[ix] == y, list(ix = i, y = pnorm(i))))
+
+

could be written more compactly as +

+
plot(0)
+for(i in 1:4)
+   text(1, 0.2*i, bquote( x[.(i)] == .(pnorm(i)) ))
+
+ +

The expression is quoted except for the contents of .() +subexpressions, which are replaced with their values. There is an +optional argument to compute the values in a different +environment. The syntax for bquote is borrowed from the LISP +backquote macro. +

+
+ + + +

6.3 More on evaluation

+ + +

The eval function was introduced earlier in this chapter as a +means of evaluating call objects. However, this is not the full story. +It is also possible to specify the + +environment in which the evaluation +is to take place. By default this is the evaluation frame from which +eval is called, but quite frequently it needs to be set to +something else. + +

+

Very often, the relevant evaluation frame is that of the parent of the +current frame (cf. ???). In particular, when the object to evaluate +is the result of a substitute operation of the function +arguments, it will contain variables that make sense to the caller only +(notice that there is no reason to expect that the variables of the +caller are in the + +lexical scope of the callee). Since evaluation in the +parent frame occurs frequently, an eval.parent function exists as +a shorthand for eval(expr, sys.frame(sys.parent())). +

+

Another case that occurs frequently is evaluation in a list or a data +frame. For instance, this happens in connection with the +model.frame function when a data argument is given. +Generally, the terms of the model formula need to be evaluated in +data, but they may occasionally also contain references to items +in the caller of model.frame. This is sometimes useful in +connection with simulation studies. So for this purpose one needs not +only to evaluate an expression in a list, but also to specify an +enclosure into which the search continues if the variable is not in the +list. Hence, the call has the form +

+
+
eval(expr, data, sys.frame(sys.parent()))
+
+ +
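For example, a minimal sketch of evaluation in a data frame with an enclosure:

d <- data.frame(x = 1:3, y = c(10, 20, 30))
e <- quote(x + y)
eval(e, d)                    # 11 22 33: x and y are found in d
eval(e, d, parent.frame())    # names not found in d are looked up in the enclosure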

Notice that evaluation in a given environment may actually change that +environment, most obviously in cases involving the + +assignment operator, +such as +

+
+
eval(quote(total <- 0), environment(robert$balance)) # rob Rob
+
+ +

This is also true when evaluating in lists, but the original list does +not change because one is really working on a copy. +

+ +
+ + + +

6.4 Evaluation of expression objects

+ +

Objects of mode "expression" are defined in Expression objects. They are very similar to lists of call objects. +

+
+
> ex <- expression(2 + 2, 3 + 4)
+> ex[[1]]
+2 + 2
+> ex[[2]]
+3 + 4
+> eval(ex)
+[1] 7
+
+ +

Notice that evaluating an expression object evaluates each call in turn, +but the final value is that of the last call. In this respect it +behaves almost identically to the compound language object +quote({2 + 2; 3 + 4}). However, there is a subtle difference: +Call objects are indistinguishable from subexpressions in a parse tree. +This means that they are automatically evaluated in the same way a +subexpression would be. Expression objects can be recognized during +evaluation and in a sense retain their quotedness. The evaluator will +not evaluate an expression object recursively, only when it is passed +directly to eval function as above. The difference can be seen +like this: +

+
+
> eval(substitute(mode(x), list(x = quote(2 + 2))))
+[1] "numeric"
+> eval(substitute(mode(x), list(x = expression(2 + 2))))
+[1] "expression"
+
+ +

The deparser represents an expression object by the call +that creates it. This is similar to the way it handles numerical +vectors and several other objects that do not have a specific external +representation. However, it does lead to the following bit of +confusion: +

+
+
> e <- quote(expression(2 + 2))
+> e
+expression(2 + 2)
+> mode(e)
+[1] "call"
+> ee <- expression(2 + 2)
+> ee
+expression(2 + 2)
+> mode(ee)
+[1] "expression"
+
+ +

I.e., e and ee look identical when printed, but one is a +call that generates an expression object and the other is the object +itself. +

+
+ + + +

6.5 Manipulation of function calls

+ +

It is possible for a + +function to find out how it has been called by +looking at the result of sys.call as in the following example of +a function that simply returns its own call: +

+
+
> f <- function(x, y, ...) sys.call()
+> f(y = 1, 2, z = 3, 4)
+f(y = 1, 2, z = 3, 4)
+
+ +

However, this is not really useful except for debugging because it +requires the function to keep track of argument matching in order to +interpret the call. For instance, it must be able to see that the 2nd +actual argument gets matched to the first formal one (x in the +above example). +

+

More often one requires the call with all actual arguments bound to the +corresponding formals. To this end, the function match.call is +used. Here’s a variant of the preceding example, a function that +returns its own call with arguments matched +

+
+
> f <- function(x, y, ...) match.call()
+> f(y = 1, 2, z = 3, 4)
+f(x = 2, y = 1, z = 3, 4)
+
+ +

Notice that the second argument now gets matched to x and appears +in the corresponding position in the result. +

+

The primary use of this technique is to call another function with the +same arguments, possibly deleting some and adding others. A typical +application is seen at the start of the lm function: +

+
+
    mf <- cl <- match.call()
+    mf$singular.ok <- mf$model <- mf$method <- NULL
+    mf$x <- mf$y <- mf$qr <- mf$contrasts <- NULL
+    mf$drop.unused.levels <- TRUE
+    mf[[1]] <- as.name("model.frame")
+    mf <- eval(mf, sys.frame(sys.parent()))
+
+ +

Notice that the resulting call is + +evaluated in the parent frame, in +which one can be certain that the involved expressions make sense. The +call can be treated as a list object where the first element is the name +of the function and the remaining elements are the actual argument +expressions, with the corresponding formal argument names as tags. +Thus, the technique to eliminate undesired arguments is to assign +NULL, as seen in lines 2 and 3, and to add an argument one uses +tagged list + +assignment (here to pass drop.unused.levels = TRUE) +as in line 4. To change the name of the function called, assign to the +first element of the list and make sure that the value is a name, either +using the as.name("model.frame") construction here or +quote(model.frame). +

+

The match.call function has an expand.dots argument which +is a switch which if set to FALSE lets all ‘...’ arguments +be collected as a single argument with the tag ‘...’. + +

+
+
> f <- function(x, y, ...) match.call(expand.dots = FALSE)
+> f(y = 1, 2, z = 3, 4)
+f(x = 2, y = 1, ... = list(z = 3, 4))
+
+ +

The ‘...’ argument is a list (a pairlist to be precise), not a call +to list like it is in S: +

+
+
> e1 <- f(y = 1, 2, z = 3, 4)$...
+> e1
+$z
+[1] 3
+
+[[2]]
+[1] 4
+
+ +

One reason for using this form of match.call is simply to get rid +of any ‘...’ arguments in order not to be passing unspecified +arguments on to functions that may not know them. Here’s an example +paraphrased from plot.formula: +

+
+
m <- match.call(expand.dots = FALSE)
+m$... <- NULL
+m[[1]] <- "model.frame"
+
+ +

A more elaborate application is in update.default where a set of +optional extra arguments can add to, replace, or cancel those of the +original call: +

+
+
extras <- match.call(expand.dots = FALSE)$...
+if (length(extras) > 0) {
+    existing <- !is.na(match(names(extras), names(call)))
+    for (a in names(extras)[existing]) call[[a]] <- extras[[a]]
+    if (any(!existing)) {
+        call <- c(as.list(call), extras[!existing])
+        call <- as.call(call)
+    }
+}
+
+ +

Notice that care is taken to modify existing arguments individually in +case extras[[a]] == NULL. Concatenation does not work on call +objects without the coercion as shown; this is arguably a bug. +

+

Two further functions exist for the construction of function calls, +namely call and do.call. +

+

The function call allows creation of a call object from the +function name and the list of arguments +

+
+
> x <- 10.5
+> call("round", x)
+round(10.5)
+
+ +

As seen, the value of x rather than the + +symbol is inserted in the +call, so it is distinctly different from round(x). The form is +used rather rarely, but is occasionally useful where the name of a +function is available as a character variable. +

+

The function do.call is related, but evaluates the call immediately +and takes the arguments from an object of mode "list" containing +all the arguments. A natural use of this is when one wants to apply a +function like cbind to all elements of a list or data frame. + +

+
+
is.na.data.frame <- function (x) {
+    y <- do.call("cbind", lapply(x, "is.na"))
+    rownames(y) <- row.names(x)
+    y
+}
+
+ +

Other uses include variations over constructions like do.call("f", +list(...)). However, one should be aware that this involves evaluation +of the arguments before the actual function call, which may defeat +aspects of lazy evaluation and argument substitution in the function +itself. A similar remark applies to the call function. +

+ +
+ + + +

6.6 Manipulation of functions

+ +

It is often useful to be able to manipulate the components of a + +function +or closure. R provides a set of interface functions for this +purpose. +

+
+
body + +
+

Returns the expression that is the body of the function. +

+
formals + +
+

Returns a list of the formal arguments to the function. This is a +pairlist. +

+
environment + +
+
+

Returns the environment associated with the function. +

+
body<- + +
+

This sets the body of the function to the supplied expression. +

+
formals<- + +
+

Sets the formal arguments of the function to the supplied list. +

+
environment<- + +
+

Sets the environment of the function to the specified environment. +

+
+ +

It is also possible to alter the bindings of different variables in the +environment of the function, using code along the lines of evalq(x +<- 5, environment(f)). +

+
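A brief sketch of these accessors (the function f is illustrative):

f <- function(x) x + y
formals(f)                 # pairlist of formal arguments: x
body(f)                    # the expression x + y
environment(f)             # the environment in which f was defined
body(f) <- quote(x * y)    # replace the body
f                          # now function(x) x * y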

It is also possible to convert a + +function to a list using +as.list. The result is the concatenation of the list of formal +arguments with the function body. Conversely such a list can be +converted to a function using as.function. This functionality is +mainly included for S compatibility. Notice that environment +information is lost when as.list is used, whereas +as.function has an argument that allows the environment to be +set. +

+
+ + + +

7 System and foreign language interfaces

+ + + + + + + +
+ + + +

7.1 Operating system access

+ +

Access to the operating system shell is via the R function system. The details will differ by platform (see the on-line help), and about all that can safely be assumed is that the first argument will be a string command that will be passed for execution (not necessarily by a shell) and the second argument will be intern which if true will collect the output of the command into an R character vector.

+

The functions system.time + +and proc.time + +are available for timing (although the information available may be +limited on non-Unix-like platforms). +

+
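For example, on a Unix-alike (the command shown is illustrative):

files <- system("ls", intern = TRUE)   # output collected as a character vector
system.time(Sys.sleep(0.2))            # user, system and elapsed times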

Information from the operating system + +environment can be accessed and manipulated with +

+ + + + + + + + +
Sys.getenv        OS environment variables
Sys.putenv
Sys.getlocale     System locale
Sys.putlocale
Sys.localeconv
Sys.time          Current time
Sys.timezone      Time zone
+
+ + +

A uniform set of file access functions is provided on all platforms: +

+ + + + + + + + + + + + +
file.access    Ascertain file accessibility
file.append    Concatenate files
file.choose    Prompt user for file name
file.copy      Copy files
file.create    Create or truncate a file
file.exists    Test for existence
file.info      Miscellaneous file information
file.remove    Remove files
file.rename    Rename files
file.show      Display a text file
unlink         Remove files or directories
+
+ +

There are also functions for manipulating file names and paths in a +platform-independent way. +

+ + + + + +
basename       File name without directory
dirname        Directory name
file.path      Construct path to file
path.expand    Expand ~ in Unix path
+
+ +
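A short sketch combining some of the functions above (the file name is illustrative):

p <- file.path(tempdir(), "notes.txt")
file.create(p)     # TRUE
file.exists(p)     # TRUE
basename(p)        # "notes.txt"
dirname(p)         # the session's temporary directory
file.remove(p)     # TRUE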
+ + + +

7.2 Foreign language interfaces

+ + + + + +

See System and foreign language interfaces in Writing R +Extensions for the details of adding functionality to R via compiled +code. +

+

Functions .C and .Fortran provide a standard interface to +compiled code that has been linked into R, either at build time or +via dyn.load. They are primarily intended for compiled C and +FORTRAN code respectively, but the .C function can be used with +other languages which can generate C interfaces, for example C++. +

+

Functions .Call and .External provide interfaces which allow +compiled code (primarily compiled C code) to manipulate R objects. +

+
+ + + +

7.3 .Internal and .Primitive

+ + + +

The .Internal and .Primitive interfaces are used to call +C code compiled into R at build time. +See .Internal vs .Primitive in R Internals. +

+ +
+ + + +

8 Exception handling

+ +

The exception handling facilities in R are provided through two +mechanisms. Functions such as stop or warning can be +called directly or options such as "warn" can be used to control +the handling of problems. +

+ + + + + + + +
+ +
+


+
+ +

8.1 stop

+ + +

A call to stop halts the evaluation of the current expression, +prints the message argument and returns execution to top-level. +

+
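A minimal sketch (the function safe_sqrt is invented for illustration):

safe_sqrt <- function(x) {
    if (!is.numeric(x)) stop("'x' must be numeric")
    sqrt(x)
}
safe_sqrt("a")    # Error: 'x' must be numeric; control returns to top-level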
+ +
+


+
+ +

8.2 warning

+ + + +

The function warning takes a single argument that is a character +string. The behaviour of a call to warning depends on the value +of the option "warn". If "warn" is negative warnings are +ignored. If it is zero, they are stored and printed after the top-level +function has completed. If it is one, they are printed as they occur +and if it is 2 (or larger) warnings are turned into errors. +

+

If "warn" is zero (the default), a variable last.warning +is created and the messages associated with each call to warning +are stored, sequentially, in this vector. If there are fewer than 10 +warnings they are printed after the function has finished evaluating. +If there are more than 10 then a message indicating how many warnings +occurred is printed. In either case last.warning contains the +vector of messages, and warnings provides a way to access and +print it. +

+
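A small sketch of the effect of the "warn" option (the function f is illustrative):

f <- function() { warning("odd input"); 0 }
old <- options(warn = 1)
f()              # the warning is printed as it occurs
options(warn = 0)
f()              # the warning is stored and reported after the top-level call finishes
warnings()       # prints the messages held in last.warning
options(old)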
+ +
+


+
+ +

8.3 on.exit

+ + +

A function can insert a call to on.exit at any point in the body +of a function. The effect of a call to on.exit is to store the +value of the body so that it will be executed when the function exits. +This allows the function to change some system parameters and to ensure +that they are reset to appropriate values when the function is finished. +The on.exit is guaranteed to be executed when the function exits +either directly or as the result of a warning. +

+

An error in the evaluation of the on.exit code causes an +immediate jump to top-level without further processing of the +on.exit code. +

+

on.exit takes a single argument which is an expression to be +evaluated when the function is exited. +

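A typical sketch of this idiom (the option changed here is illustrative):

with_few_digits <- function() {
    old <- options(digits = 3)
    on.exit(options(old))    # restore the option when the function exits
    print(pi)
}
with_few_digits()    # prints 3.14; the digits option is restored afterwards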
+ + + + + +
+ +
+


+
+ +

8.4 Error options

+ +

There are a number of options variables that can be used to +control how R handles errors and warnings. They are listed in the +table below. +

+
+
warn
+

Controls the printing of warnings. +

+
warning.expression
+

Sets an expression that is to be evaluated when a warning occurs. The +normal printing of warnings is suppressed if this option is set. +

+
error
+

Installs an expression that will be evaluated when an error occurs. +The normal printing of error messages and warning messages precedes the +evaluation of the expression. +

+
+ +

Expressions installed by options("error") are evaluated before +calls to on.exit are carried out. +

+

One can use options(error = expression(q("yes"))) to get R to +quit when an error has been signalled. In this case an error will cause +R to shut down and the global environment will be saved. +

+
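A minimal sketch of installing and removing an error handler (the handler expression is illustrative):

old <- options(error = expression(cat("an error occurred\n")))
# any subsequent error now evaluates the installed expression after the message is printed
options(old)    # restore the previous behaviour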
+ +
+


+
+ +

9 Debugging

+ +

Debugging code has always been a bit of an art. R provides several +tools that help users find problems in their code. These tools halt +execution at particular points in the code and the current state of the +computation can be inspected. +

+

Most debugging takes place either through calls to browser or +debug. Both of these functions rely on the same internal +mechanism and both provide the user with a special prompt. Any command +can be typed at the prompt. The evaluation + +environment for the command +is the currently active environment. This allows you to examine the +current state of any variables etc. +

+

There are five special commands that R interprets differently. They +are, +

+
+
RET
+

Go to the next statement if the function is being debugged. Continue +execution if the browser was invoked. +

+
c
+
cont
+

Continue the execution. +

+
n
+

Execute the next statement in the function. This works from the browser +as well. +

+
where
+

Show the call stack +

+
Q
+

Halt execution and jump to the top-level immediately. +

+
+ + +

If there is a local variable with the same name as one of the special +commands listed above then its value can be accessed by using +get. A call to get with the name in quotes will retrieve +the value in the current + +environment. +

+

The debugger provides access only to interpreted expressions. If a +function calls a foreign language (such as C) then no access to the +statements in that language is provided. Execution will halt on the +next statement that is evaluated in R. A symbolic debugger such as +gdb can be used to debug compiled code. +

+ + + + + + + +
+ +
+


+
+ +

9.1 browser

+ + +

A call to the function browser causes R to halt execution at +that point and to provide the user with a special prompt. Arguments to +browser are ignored. +

+
+
> foo <- function(s) {
++ c <- 3
++ browser()
++ }
+> foo(4)
+Called from: foo(4)
+Browse[1]> s
+[1] 4
+Browse[1]> get("c")
+[1] 3
+Browse[1]>
+
+ +
+ +
+


+
+ +

9.2 debug/undebug

+ + + +

The debugger can be invoked on any function by using the command +debug(fun). Subsequently, each time that function is +evaluated the debugger is invoked. The debugger allows you to control +the evaluation of the statements in the body of the function. Before +each statement is executed the statement is printed out and a special +prompt provided. Any command can be given, those in the table above +have special meaning. +

+

Debugging is turned off by a call to undebug with the function as +an argument. +

+
+
> debug(mean.default)
+> mean(1:10)
+debugging in: mean.default(1:10)
+debug: {
+    if (na.rm)
+        x <- x[!is.na(x)]
+    trim <- trim[1]
+    n <- length(c(x, recursive = TRUE))
+    if (trim > 0) {
+        if (trim >= 0.5)
+            return(median(x, na.rm = FALSE))
+        lo <- floor(n * trim) + 1
+        hi <- n + 1 - lo
+        x <- sort(x, partial = unique(c(lo, hi)))[lo:hi]
+        n <- hi - lo + 1
+    }
+    sum(x)/n
+}
+Browse[1]>
+debug: if (na.rm) x <- x[!is.na(x)]
+Browse[1]>
+debug: trim <- trim[1]
+Browse[1]>
+debug: n <- length(c(x, recursive = TRUE))
+Browse[1]> c
+exiting from: mean.default(1:10)
+[1] 5.5
+
+ +
+ +
+


+
+ +

9.3 trace/untrace

+ + + +

Another way of monitoring the behaviour of R is through the +trace mechanism. trace is called with a single argument +that is the name of the function you want to trace. The name does not +need to be quoted but for some functions you will need to quote the name +in order to avoid a syntax error. +

+

When trace has been invoked on a function then every time that +function is evaluated the call to it is printed out. This mechanism is +removed by calling untrace with the function as an argument. +

+
+
> trace("[<-")
+> x <- 1:10
+> x[3] <- 4
+trace: "[<-"(*tmp*, 3, value = 4)
+
+ +
+ +
+


+
+ +

9.4 traceback

+ + +

When an error has caused a jump to top-level a special variable called +.Traceback is placed into the base environment. +.Traceback is a character vector with one entry for each function +call that was active at the time the error occurred. An examination of +.Traceback can be carried out by a call to traceback. +

+
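A minimal sketch (f and g are illustrative):

f <- function() g()
g <- function() stop("problem in g")
f()              # error: problem in g
traceback()      # lists the calls active at the time of the error: g() called from f()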
+ +
+


+
+ +

10 Parser

+ + +

The parser is what converts the textual representation of R code into +an internal form which may then be passed to the R evaluator which +causes the specified instructions to be carried out. The internal form +is itself an R object and can be saved and otherwise manipulated +within the R system. +

+ + + + + + + + +
+ +
+


+
+ +

10.1 The parsing process

+ + + + + + + +
+ + + +

10.1.1 Modes of parsing

+ +

Parsing in R occurs in three different variants: +

+
    +
  • The read-eval-print loop +
  • Parsing of text files +
  • Parsing of character strings +
+ +

The read-eval-print loop forms the basic command line interface to R. +Textual input is read until a complete R expression is available. +Expressions may be split over several input lines. The primary prompt +(by default ‘> ’) indicates that the parser is ready for a new +expression, and a continuation prompt (by default ‘+ ’) indicates +that the parser expects the remainder of an incomplete expression. The +expression is converted to internal form during input and the parsed +expression is passed to the evaluator and the result is printed (unless +specifically made invisible). If the parser finds itself in a state +which is incompatible with the language syntax, a “Syntax Error” is +flagged and the parser resets itself and resumes input at the beginning +of the next input line. +

+

Text files can be parsed using the parse function. In +particular, this is done during execution of the source +function, which allows commands to be stored in an external file and +executed as if they had been typed at the keyboard. Note, though, that +the entire file is parsed and syntax checked before any evaluation takes +place. +

+

Character strings, or vectors thereof, can be parsed using the +text= argument to parse. The strings are treated exactly +as if they were the lines of an input file. +

+
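For instance, a minimal sketch of parsing character strings:

exprs <- parse(text = "1 + 2; sin(pi/2)")
length(exprs)      # 2 parsed expressions
eval(exprs[[1]])   # 3
eval(exprs[[2]])   # 1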
+ +
+


+
+ +

10.1.2 Internal representation

+ + +

Parsed expressions are stored in an R object containing the parse +tree. A fuller description of such objects can be found in +Language objects and Expression objects. Briefly, every +elementary R expression is stored in + +function call form, as a list +with the first element containing the function name and the remainder +containing the arguments, which may in turn be further R expressions. +The list elements can be named, corresponding to tagged matching of +formal and actual arguments. Note that all R syntax elements +are treated in this way, e.g. the assignment x <- 1 is encoded +as "<-"(x, 1). +

+
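This can be seen directly; a small sketch:

e <- quote(x <- 1)
mode(e)        # "call": the assignment is stored in function call form
e[[1]]         # `<-`
e[[2]]         # x
e[[3]]         # 1
as.list(e)     # the same components as a list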
+ + + +

10.1.3 Deparsing

+ +

Any R object can be converted to an R expression using +deparse. This is frequently used in connection with output of +results, e.g. for labeling plots. Notice that only objects of mode +"expression" can be expected to be unchanged by reparsing the +output of deparsing. For instance, the numeric vector 1:5 will +deparse as "c(1, 2, 3, 4, 5)", which will reparse as a call to +the function c. As far as possible, evaluating the deparsed and +reparsed expression gives the same result as evaluating the original, +but there are a couple of awkward exceptions, mostly involving +expressions that weren’t generated from a textual representation in the +first place. +

+
+ +
+


+
+ +

10.2 Comments

+ + +

Comments in R are ignored by the parser. Any text from a + +# character +to the end of the line is taken to be a comment, unless +the # character is inside a quoted string. For example, +

+
+
> x <- 1  # This is a comment...
+> y <- "  #... but this is not."
+
+ +
+ +
+


+
+ +

10.3 Tokens

+ +

Tokens are the elementary building blocks of a programming language. +They are recognised during lexical analysis which (conceptually, +at least) takes place prior to the syntactic analysis performed by the +parser itself. +

+ + + + + + + + + + + +
+ +
+


+
+ +

10.3.1 Constants

+ +

There are five types of constants: integer, logical, numeric, complex and string. +

+

In addition, there are four special constants, NULL, NA, +Inf, and NaN. +

+

NULL is used to indicate the empty object. NA is used for +absent (“Not Available”) data values. Inf denotes infinity and +NaN is not-a-number in the IEEE floating point calculus +(results of the operations respectively 1/0 and 0/0, for +instance). +

+

Logical constants are either TRUE or FALSE. +

+

Numeric constants follow a similar syntax to that of the C language. They consist of an integer part of zero or more digits, followed optionally by ‘.’ and a fractional part of zero or more digits, optionally followed by an exponent part consisting of an ‘E’ or an ‘e’, an optional sign and a string of one or more digits. Either the integer part or the fractional part can be empty, but not both at once.

+
+
Valid numeric constants: 1 10 0.1 .2 1e-7 1.2e+7
+
+ +

Numeric constants can also be hexadecimal, starting with ‘0x’ or ‘0X’ followed by zero or more digits, ‘a-f’ or ‘A-F’. Hexadecimal floating point constants are supported using C99 syntax, e.g. ‘0x1.1p1’.

+

There is now a separate class of integer constants. They are created +by using the qualifier L at the end of the number. For +example, 123L gives an integer value rather than a numeric +value. The suffix L can be used to qualify any non-complex +number with the intent of creating an integer. So it can be used with +numbers given by hexadecimal or scientific notation. However, if the +value is not a valid integer, a warning is emitted and the numeric +value created. The following shows examples of valid integer +constants, values which will generate a warning and give numeric +constants and syntax errors. +

+
+
Valid integer constants:  1L, 0x10L, 1000000L, 1e6L
+Valid numeric constants:  1.1L, 1e-3L, 0x1.1p-2
+Syntax error:  12iL 0x1.1
+
+ +

A warning is emitted for decimal values that contain an unnecessary +decimal point, e.g. 1.L. It is an error to have a decimal +point in a hexadecimal constant without the binary exponent. +

+

Note also that a preceding sign (+ or -) is treated as a +unary operator, not as part of the constant. +

+

Up-to-date information on the currently accepted formats can be found by +?NumericConstants. +

+

Complex constants have the form of a decimal numeric constant followed by ‘i’. Notice that only purely imaginary numbers are actual constants; other complex numbers are parsed as unary or binary operations on numeric and imaginary numbers.

+
+
Valid complex constants: 2i 4.1i 1e-2i
+
+ +

String constants are delimited by a pair of single (‘'’) or double +(‘"’) quotes and can contain all other printable characters. +Quotes and other special characters within strings are specified using +escape sequences: +

+
+
\'
+

single quote +

+
\"
+

double quote +

+
\n
+

newline +

+
\r
+

carriage return +

+
\t
+

tab character +

+
\b
+

backspace +

+
\a
+

bell +

+
\f
+

form feed +

+
\v
+

vertical tab +

+
\\
+

backslash itself +

+
\nnn
+

character with given octal code – sequences of one, two or three digits +in the range 0 ... 7 are accepted. +

+
\xnn
+

character with given hex code – sequences of one or two hex digits +(with entries 0 ... 9 A ... F a ... f). +

+
\unnnn \u{nnnn}
+

(where multibyte locales are supported, otherwise an error). +Unicode character with given hex code – sequences of up to four hex +digits. The character needs to be valid in the current locale. +

+
\Unnnnnnnn \U{nnnnnnnn}
+

(where multibyte locales are supported and not on Windows, otherwise an +error). Unicode character with given hex code – sequences of up to +eight hex digits. +

+
+ +

A single quote may also be embedded directly in a double-quote delimited +string and vice versa. +

+

As from R 2.8.0, a ‘nul’ (\0) is not allowed in a character +string, so using \0 in a string constant terminates the constant +(usually with a warning): further characters up to the closing quote are +scanned but ignored. +

+
+ +
+


+
+ +

10.3.2 Identifiers

+ + +

Identifiers consist of a sequence of letters, digits, the period +(‘.’) and the underscore. They must not start with a digit or +an underscore, or with a period followed by a digit. +

+

The definition of a letter depends on the current locale: the precise +set of characters allowed is given by the C expression (isalnum(c) +|| c == ‘.’ || c == ‘_’) and will include accented letters in many +Western European locales. +

+

Notice that identifiers starting with a period are not by default listed +by the ls function and that ‘...’ and ‘..1’, +‘..2’, etc. are special. +

+

Notice also that objects can have names that are not identifiers. These +are generally accessed via get and assign, although they +can also be represented by text strings in some limited circumstances +when there is no ambiguity (e.g. "x" <- 1). As get and +assign are not restricted to names that are identifiers they do +not recognise subscripting operators or replacement functions. The +following pairs are not equivalent + + +

+
+ + + + +
x$a <- 1          assign("x$a", 1)
x[[1]]            get("x[[1]]")
names(x) <- nm    assign("names(x)", nm)
+
+ +
+ +
+


+
+ +

10.3.3 Reserved words

+ +

The following identifiers have a special meaning and cannot be used +for object names +

+
+
if else repeat while function for in next break
+TRUE FALSE NULL Inf NaN
+NA NA_integer_ NA_real_ NA_complex_ NA_character_
+... ..1 ..2 etc.
+
+ +
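A couple of quick illustrations (a sketch; exact error messages differ between versions):

# for <- 1                  # syntax error: 'for' is a reserved word
# TRUE <- FALSE             # error: a reserved constant cannot be a target of assignment
`if`(2 > 1, "yes", "no")    # "yes": the function behind a reserved word can still be
                            # called via backquotes, though that is rarely useful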
+ +
+


+
+ +

10.3.4 Special operators

+ +

R allows user-defined infix operators. These have the form of a +string of characters delimited by the ‘%’ character. The string +can contain any printable character except ‘%’. The escape sequences +for strings do not apply here. +

+

Note that the following operators are predefined +

+
+
%% %*% %/% %in% %o% %x%
+
+ + + +
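A sketch of a user-defined infix operator alongside the predefined ones (the name %+% is arbitrary):

"%+%" <- function(a, b) paste(a, b)   # any printable characters except '%' between the % signs
"hello" %+% "world"                   # "hello world"
7 %% 3                                # 1, the predefined modulo operator
c(2, 5) %in% 1:3                      # TRUE FALSE, the predefined value-matching operator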
+ +
+


+
+ +

10.3.5 Separators

Although not strictly tokens, stretches of whitespace characters (spaces, tabs and formfeeds, plus other Unicode whitespace characters on Windows and in UTF-8 locales; see footnote 4) serve to delimit tokens in case of ambiguity (compare x<-5 and x < -5).

+ +

Newlines have a function which is a combination of token separator and +expression terminator. If an expression can terminate at the end of +the line the parser will assume it does so, otherwise the newline is +treated as whitespace. Semicolons (‘;’) may be used to separate +elementary + +expressions on the same line. +

+ +

Special rules apply to the else keyword: inside a compound +expression, a newline before else is discarded, whereas at the +outermost level, the newline terminates the if construction and a +subsequent else causes a syntax error. This somewhat anomalous +behaviour occurs because R should be usable in interactive mode and +then it must decide whether the input expression is complete, +incomplete, or invalid as soon as the user presses RET. +

+
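A minimal sketch of these whitespace and newline rules:

x<-5       # one token '<-': assign 5 to x
x < -5     # three tokens: compare x with -5 (FALSE here)

f <- function(x) {
  if (x > 0) "positive"
  else "not positive"    # allowed: inside braces the newline before 'else' is discarded
}
# Typed at top level, the same 'if' line followed by an 'else' line gives a syntax error.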

The comma (‘,’) is used to separate function arguments and multiple +indices. +

+
+ +
+


+
+ +

10.3.6 Operator tokens

+ +

R uses the following operator tokens +

+
+ - * / %% ^        arithmetic
> >= < <= == !=     relational
! & |               logical
~                   model formulae
-> <-               assignment
$                   list indexing
:                   sequence
+
+ +

(Several of the operators have different meaning inside model formulas) +

+
+ +
+


+
+ +

10.3.7 Grouping

+ +

Ordinary parentheses—‘(’ and ‘)’—are used for explicit +grouping within expressions and to delimit the argument lists for +function definitions and function calls. +

+

Braces—‘{’ and ‘}’—delimit blocks of expressions in +function definitions, conditional expressions, and iterative constructs. +

+
+ +
+


+
+ +

10.3.8 Indexing tokens

+ +

Indexing of arrays and vectors is performed using the single and double +brackets, ‘[]’ and ‘[[]]’. Also, indexing tagged lists +may be done using the ‘$’ operator. +

+ +
+ +
+


+
+ +

10.4 Expressions

+ +

An R program consists of a sequence of R expressions. An +expression can be a simple expression consisting of only a constant or +an identifier, or it can be a compound expression constructed from other +parts (which may themselves be expressions). +

+

The following sections detail the various syntactical constructs that +are available. +

+ + + + + + + + + + +
+ + + +

10.4.1 Function calls

+ + +

A function call takes the form of a function reference followed by a +comma-separated list of arguments within a set of parentheses. +

+
+
function_reference ( arg1, arg2, ...... , argn )
+
+ +

The function reference can be either +

    +
  • an identifier (the name of the function) +
  • a text string (ditto, but handy if the function has a name which is not +a valid identifier) +
  • an expression (which should evaluate to a function object) +
+ +

Each argument can be tagged (tag=expr), or just be a +simple expression. It can also be empty or it can be one of the special +tokens ‘...’, ‘..2’, etc. +

+

A tag can be an identifier or a text string. +

+

Examples: +

+
+
f(x)
+g(tag = value, , 5)
+"odd name"("strange tag" = 5, y)
+(function(x) x^2)(5)
+
+ +
+ + + +

10.4.2 Infix and prefix operators

+ +

The order of precedence (highest first) of the operators is +

+
+
::
+$ @
+^
+- +                (unary)
+:
+%xyz%
+* /
++ -                (binary)
+> >= < <= == !=
+!
+& &&
+| ||
+~                  (unary and binary)
+-> ->>
+=                  (as assignment)
+<- <<-
+
+

Note that : precedes binary +/-, but not ^. Hence, +1:3-1 is 0 1 2, but 1:2^3 is 1:8. +

+

The exponentiation operator ‘^’ and the + +left assignment plus minus operators +‘<- - = <<-’ group right to left, all other operators group left to +right. That is, 2 ^ 2 ^ 3 is 2 ^ 8, not 4 ^ 3, +whereas 1 - 1 - 1 is -1, not 1. +

+

Notice that the operators %% and %/% for integer +remainder and divide have higher precedence than multiply and divide. +

+
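A few worked cases of these precedence and grouping rules (values as printed in a standard R session):

1:3 - 1      # 0 1 2              (':' binds tighter than binary '-')
1:2^3        # 1 2 3 4 5 6 7 8    ('^' binds tighter than ':')
2 ^ 2 ^ 3    # 256                (right to left: 2 ^ (2 ^ 3))
1 - 1 - 1    # -1                 (left to right: (1 - 1) - 1)
-2 ^ 2       # -4                 (unary minus has lower precedence than '^')
6 * 7 %% 2   # 6                  ('%%' binds tighter than '*')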

Although it is not strictly an operator, it also needs mentioning that +the ‘=’ sign is used for tagging arguments in +function calls and +for assigning default values in function definitions. +

+

The ‘$’ sign is in some sense an operator, but does not allow +arbitrary right hand sides and is discussed under Index constructions. It has higher precedence than any of the other +operators. +

+

The parsed form of a unary or binary operation is completely equivalent +to a function call with the operator as the function name and the +operands as the function arguments. +

+

Parentheses are recorded as equivalent to a unary operator, with name +"(", even in cases where the parentheses could be inferred from +operator precedence (e.g., a * (b + c)). +

+
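This can be inspected directly; a short sketch using quote():

e <- quote(a * (b + c))
class(e)       # "call"
e[[1]]         # `*`: the operator is the function name of the call
e[[3]]         # (b + c), itself a call
e[[3]][[1]]    # `(`: the parentheses are recorded as a call to "("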

Notice that the + +assignment symbols are operators just like the arithmetic, relational, +and logical ones. Any expression is allowed also on the target side of +an assignment, as far as the parser is concerned (2 + 2 <- 5 is a +valid expression as far as the parser is concerned. The evaluator will +object, though). Similar comments apply to the model formula operator. +

+
+ + + +

10.4.3 Index constructions

+ +

R has three indexing constructs, two of which are syntactically +similar although with somewhat different semantics: +

+
+
object [ arg1, ...... , argn ]
+object [[ arg1, ...... , argn ]]
+
+ + + +

The object can formally be any valid expression, but it is +understood to denote or evaluate to a subsettable object. The arguments +generally evaluate to numerical or character indices, but other kinds of +arguments are possible (notably drop = FALSE). +

+

Internally, these index constructs are stored as function calls with +function name "[" respectively "[[". +

+

The third index construction is +

+
+
object $ tag
+
+ + +

Here, object is as above, whereas tag is an identifier or a +text string. Internally, it is stored as a function call with name +"$" +

+ + +
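A brief sketch (the list and its tags are arbitrary):

x <- list(alpha = 1:3, "odd name" = 4)
x[["alpha"]]          # 1 2 3
x$alpha               # 1 2 3
x$"odd name"          # 4: the tag may be given as a text string
quote(x$alpha)[[1]]   # `$`: stored internally as a call to "$"
quote(x[[1]])[[1]]    # `[[`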
+ + + +

10.4.4 Compound expressions

+ +

A compound expression is of the form +

+
+
{ expr1 ; expr2 ; ...... ; exprn }
+
+ +

The semicolons may be replaced by newlines. Internally, this is stored +as a function call with "{" as the function name and the +expressions as arguments. +

+
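For example (a minimal sketch):

v <- {
  a <- 2
  a * 10           # the value of the block is the value of its last expression
}
v                  # 20
quote({1; 2})[[1]] # `{`: the block is stored as a call with "{" as the function name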
+ + + +

10.4.5 Flow control elements

+ +

R contains the following control structures as special syntactic +constructs +

+
+
if ( cond ) expr
+if ( cond ) expr1 else expr2
+while ( cond ) expr
+repeat expr
+for ( var in list ) expr
+
+ +

The expressions in these constructs will typically be compound +expressions. +

+

Within the loop constructs (while, repeat, for), +one may use break (to terminate the loop) and next (to +skip to the next iteration). +

+
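A small sketch of break and next inside a loop:

for (i in 1:5) {
  if (i == 3) next    # skip the rest of this iteration
  if (i == 5) break   # terminate the loop entirely
  print(i)            # prints 1, 2 and 4
}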

Internally, the constructs are stored as function calls: +

+
+
"if"(cond, expr)
+"if"(cond, expr1, expr2)
+"while"(cond, expr)
+"repeat"(expr)
+"for"(var, list, expr)
+"break"()
+"next"()
+
+ +
+ +
+


+
+ +

10.4.6 Function definitions

+ +

A + +function definition is of the form +

+
+
function ( arglist ) body
+
+ +

The function body is an expression, often a compound expression. The +arglist is a comma-separated list of items each of which can be an +identifier, or of the form ‘identifier = default’, or +the special token ‘...’. The default can be any valid +expression. +

+

Notice that function arguments unlike list tags, etc., cannot have +“strange names” given as text strings. +

+

Internally, a function definition is stored as a function call with +function name function and two arguments, the arglist and +the body. The arglist is stored as a tagged pairlist where +the tags are the argument names and the values are the default +expressions. +

+
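For example (a minimal sketch):

f <- function(x, y = 2, ...) {   # 'y' has a default; '...' absorbs further arguments
  x + y
}
f(1)         # 3
formals(f)   # the arglist, a tagged pairlist of argument names and defaults
body(f)      # the body expression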
+ +
+


+
+ +

10.5 Directives

+ + + +

The parser currently only supports one directive, #line. +This is similar to the C-preprocessor directive of the same name. The +syntax is +

+
+
#line nn [ "filename" ]
+
+ +

where nn is an integer line number, and the optional filename +(in required double quotes) names the source file. +

+

Unlike the C directive, #line must appear as the first five characters +on a line. As in C, nn and "filename" entries may be separated +from it by whitespace. And unlike C, any following text on the line will be +treated as a comment and ignored. +

+

This directive tells the parser that the following line should be assumed to +be line nn of file filename. (If the filename is not given, +it is assumed to be the same as for the previous directive.) This is not +typically used by users, but may be used by preprocessors so that +diagnostic messages refer to the original file. +

+ + +
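A minimal sketch of the directive as it might appear in a preprocessor-generated script; the file name and line number here are purely hypothetical:

#line 120 "report-source.R"
x <- 1:10    # the parser treats this as line 120 of report-source.R,
mean(x)      # so source-tracking diagnostics should point back to that file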
+ +
+


+
+ +

Function and Variable Index

+ +
Index Entry  Section

#
#: Comments

$
$: Indexing
$: Index constructions

.
.C: Foreign language interfaces
.Call: Foreign language interfaces
.External: Foreign language interfaces
.Fortran: Foreign language interfaces
.Internal: .Internal and .Primitive
.Primitive: .Internal and .Primitive

[
[: Indexing
[: Index constructions
[[: Indexing
[[: Index constructions

A
as.call: Language objects
as.character: Symbol objects
as.function: Function objects
as.list: Language objects
as.name: Symbol objects
assign: Identifiers
attr: Attributes
attr<-: Attributes
attributes: Attributes
attributes<-: Attributes

B
baseenv: Environment objects
basename: Operating system access
body: Function objects
body: Manipulation of functions
body<-: Manipulation of functions
break: Looping
browser: browser

D
debug: debug/undebug
dirname: Operating system access
do.call: Manipulation of function calls

E
emptyenv: Environment objects
environment: Function objects
environment: Manipulation of functions
environment<-: Manipulation of functions
eval: More on evaluation

F
file.access: Operating system access
file.append: Operating system access
file.choose: Operating system access
file.copy: Operating system access
file.create: Operating system access
file.exists: Operating system access
file.info: Operating system access
file.path: Operating system access
file.remove: Operating system access
file.rename: Operating system access
file.show: Operating system access
for: for
formals: Function objects
formals: Manipulation of functions
formals<-: Manipulation of functions

G
get: Identifiers

I
is.na: NA handling
is.nan: NA handling

M
match.arg: Argument matching
match.call: Argument matching
match.call: Manipulation of function calls
match.fun: Argument matching
missing: NA handling
mode: Objects

N
NA: NA handling
NA: Indexing by vectors
names: Names
names<-: Names
NaN: NA handling
new.env: Environment objects
next: Looping
NextMethod: NextMethod
NULL: NULL object

O
on.exit: on.exit

P
pairlist: Pairlist objects
path.expand: Operating system access
proc.time: Operating system access

Q
quote: Language objects

R
repeat: repeat

S
stop: stop
storage.mode: Objects
substitute: Substitutions
switch: switch
Sys.getenv: Operating system access
Sys.getlocale: Operating system access
Sys.localeconv: Operating system access
Sys.putenv: Operating system access
Sys.putlocale: Operating system access
Sys.time: Operating system access
Sys.timezone: Operating system access
system: Operating system access
system.time: Operating system access

T
trace: trace/untrace
traceback: traceback
typeof: Objects

U
undebug: debug/undebug
unlink: Operating system access
untrace: trace/untrace
UseMethod: UseMethod

W
warning: warning
warnings: warning
while: while

+
+ +
+ +
+


+
+ +

Concept Index

+ +
Index Entry  Section

#
#line: Directives

.
.Internal: Builtin objects and special forms
.Primitive: Builtin objects and special forms

A
argument: Function objects
argument: Syntax and examples
argument, default values: Arguments
assignment: Function objects
assignment: Function calls
assignment: Operators
assignment: Subset assignment
assignment: Global environment
assignment: Argument evaluation
assignment: Scope
assignment: UseMethod
assignment: UseMethod
assignment: More on evaluation
assignment: Manipulation of function calls
assignment: Infix and prefix operators
assignment: Infix and prefix operators
atomic: Vector objects
attributes: Attributes

B
binding: Scope
binding: Scope

C
call: Language objects
call stack: Stacks
coercion: Objects
coercion: Symbol objects
coercion: Any-type
coercion: Classes
coercion: NA handling
comments: Comments
complex assignment: Subset assignment

E
environment: Function objects
environment: Function objects
environment: Promise objects
environment: Environment objects
environment: Control structures
environment: Global environment
environment: Lexical environment
environment: Stacks
environment: Search path
environment: Evaluation environment
environment: Argument evaluation
environment: Argument evaluation
environment: Scope
environment: UseMethod
environment: UseMethod
environment: More on evaluation
environment: Manipulation of functions
environment: Operating system access
environment: Debugging
environment: Debugging
environment, evaluation: Lexical environment
environment, evaluation: Lexical environment
environment, evaluation: Argument evaluation
evaluation: Stacks
evaluation: Evaluation environment
evaluation: Argument evaluation
evaluation: Scope
evaluation: Inheritance
evaluation: UseMethod
evaluation: More on evaluation
evaluation: Manipulation of function calls
evaluation, argument: Argument evaluation
evaluation, expression: Expression objects
evaluation, expression: Promise objects
evaluation, expression: Arguments
evaluation, lazy: Objects
evaluation, lazy: Substitutions
evaluation, lazy: Substitutions
evaluation, statement: Control structures
evaluation, symbol: Attributes
evaluation, symbol: Symbol lookup
evaluation, symbol: Scope
expression: Introduction
expression: Language objects
expression: Separators
expression object: Expression objects
expression object: Expression objects

F
frame: Lexical environment
function: Function objects
function: Function objects
function: Function objects
function: Builtin objects and special forms
function: Builtin objects and special forms
function: Promise objects
function: Dot-dot-dot
function: Function calls
function: Lexical environment
function: Lexical environment
function: Stacks
function: Writing functions
function: Syntax and examples
function: Syntax and examples
function: Arguments
function: Evaluation environment
function: Argument matching
function: Argument evaluation
function: Argument evaluation
function: Argument evaluation
function: Argument evaluation
function: Scope
function: Scope
function: Scope
function: Object-oriented programming
function: Definition
function: Manipulation of function calls
function: Manipulation of functions
function: Manipulation of functions
function: Internal representation
function: Function calls (expressions)
function: Function definitions
function argument: Promise objects
function argument: Dot-dot-dot
function arguments: Function calls
function invocation: Function calls
function, accessor: Attributes
function, anonymous: Syntax and examples
function, assignment: Function calls
function, generic: Object-oriented programming
function, generic: Definition
function, generic: Definition
function, generic: Definition
function, generic: Inheritance
function, generic: Method dispatching
function, generic: Writing methods
function, generic: Writing methods
function, internal: Argument evaluation
function, internal: Group methods
function, modeling: Factors

I
identifier: Identifiers
index: Vector objects
index: List objects
index: Indexing
index: Indexing by vectors
index: Indexing matrices and arrays
index: Indexing matrices and arrays

M
mode: Objects
mode: Vector objects
mode: Symbol objects
modeling function: Factors

N
name: Language objects
name: Symbol objects
name: Symbol lookup
name: Propagation of names
name: Scope of variables
name: Arguments
name: Argument matching
name: Argument evaluation
name: Method dispatching
name: NextMethod
name: Direct manipulation of language objects
name: Debugging
namespace: Search path

O
object: Objects
object: Objects
object: Symbol objects
object: Attributes
object: Method dispatching
object-oriented: Object-oriented programming
object-oriented: Definition

P
parsing: Language objects
parsing: Symbol objects
parsing: Evaluation of expressions
parsing: Computing on the language
parsing: Direct manipulation of language objects
parsing: Substitutions
parsing: Parser
parsing: Internal representation
partial matching: Indexing by vectors
promise: Promise objects

S
scope: Scope of variables
scope: Stacks
scope: Scope
scope: Scope
scope: Scope
scope: More on evaluation
search path: Search path
statement: Language objects
symbol: Symbol objects
symbol: Symbol objects
symbol: Symbol lookup
symbol: Scope
symbol: Substitutions
symbol: Manipulation of function calls

T
token: Expression objects
type: Objects
type: Objects
type: Basic types
type: Vector objects
type: Names
type: NA handling

V
value: Symbol lookup
variable: Objects
vector: Vector objects
vector: Dimensions
vector: Operators

+
+ +
+ +
+


+
+ +

Appendix A References

+ +

Richard A. Becker, John M. Chambers and Allan R. Wilks (1988), +The New S Language. Chapman & Hall, New York. +This book is often called the “Blue Book”. +

+
+
+

Footnotes

+ +

(1)

+

actually two, but this draft +manual predates the methods package.

+

(2)

+

Evaluation always takes place in an + +environment. +See Scope of variables for more details.

+

(3)

+

Looping is the repeated evaluation of a statement or +block of statements.

+

(4)

+

such as U+A0, non-breaking space, +and U+3000, ideographic space.

+
+
diff --git a/R.spec b/R.spec
index 1782c3e..55020fc 100644
--- a/R.spec
+++ b/R.spec
@@ -56,12 +56,26 @@
 Name: R
 Version: 3.2.3
-Release: 3%{?dist}
+Release: 4%{?dist}
 Summary: A language for data analysis and graphics
 URL: http://www.r-project.org
 Source0: ftp://cran.r-project.org/pub/R/src/base/R-3/R-%{version}.tar.gz
 Source1: macros.R
 Source2: R-make-search-index.sh
+%if %{texi2any}
+# If we have texi2any 5.1+, we can generate the docs on the fly.
+# If not, we're building for a very old target (RHEL 6 or older)
+%else
+# In this case, we need to use pre-built manuals.
+# NOTE: These need to be updated for every new version.
+Source100: https://cran.r-project.org/doc/manuals/r-release/R-intro.html
+Source101: https://cran.r-project.org/doc/manuals/r-release/R-data.html
+Source102: https://cran.r-project.org/doc/manuals/r-release/R-admin.html
+Source103: https://cran.r-project.org/doc/manuals/r-release/R-exts.html
+Source104: https://cran.r-project.org/doc/manuals/r-release/R-lang.html
+Source105: https://cran.r-project.org/doc/manuals/r-release/R-ints.html
+Source106: https://cran.r-project.org/doc/FAQ/R-FAQ.html
+%endif
 Patch0: 0001-Disable-backing-store-in-X11-window.patch
 Patch1: 0001-Wait-for-MapNotify-event-while-intializing-window.patch
 License: GPLv2+
@@ -528,6 +542,13 @@ pushd $RPM_BUILD_ROOT%{_datadir}/texmf/tex/latex
 ln -s ../../../R/texmf/tex/latex R
 popd
 
+%if %{texi2any}
+# Do not need to copy files...
+%else
+# COPY THAT FLOPPY
+cp -a %{SOURCE100} %{SOURCE101} %{SOURCE102} %{SOURCE103} %{SOURCE104} %{SOURCE105} %{SOURCE106} %{buildroot}%{?_pkgdocdir}%{!?_pkgdocdir:%{_docdir}/%{name}-%{version}}/manual/
+%endif
+
 %check
 # Needed by tests/ok-error.R, which will smash the stack on PPC64. This is the purpose of the test.
 ulimit -s 16384
@@ -970,6 +991,9 @@ R CMD javareconf \
 %postun -n libRmath -p /sbin/ldconfig
 
 %changelog
+* Tue Jan 26 2016 Tom Callaway - 3.2.3-4
+- if texi2any is set to 0, then copy in prebuilt html manuals (RHEL 5 & 6 only)
+
 * Tue Jan 26 2016 Tom Callaway - 3.2.3-3
 - use global instead of define