was read the article
array:22 [ "pii" => "S0016716915000033" "issn" => "00167169" "doi" => "10.1016/j.gi.2015.04.002" "estado" => "S300" "fechaPublicacion" => "2015-01-01" "aid" => "2" "copyrightAnyo" => "2015" "documento" => "article" "licencia" => "http://creativecommons.org/licenses/by-nc-nd/4.0/" "subdocumento" => "fla" "cita" => "Geofisica Internacional. 2015;54:31-48" "abierto" => array:3 [ "ES" => true "ES2" => true "LATM" => true ] "gratuito" => true "lecturas" => array:2 [ "total" => 1567 "formatos" => array:3 [ "EPUB" => 41 "HTML" => 1044 "PDF" => 482 ] ] "itemSiguiente" => array:17 [ "pii" => "S0016716915000082" "issn" => "00167169" "doi" => "10.1016/j.gi.2015.04.007" "estado" => "S300" "fechaPublicacion" => "2015-01-01" "aid" => "7" "documento" => "article" "licencia" => "http://creativecommons.org/licenses/by-nc-nd/4.0/" "subdocumento" => "fla" "cita" => "Geofisica Internacional. 2015;54:49-65" "abierto" => array:3 [ "ES" => true "ES2" => true "LATM" => true ] "gratuito" => true "lecturas" => array:2 [ "total" => 1380 "formatos" => array:3 [ "EPUB" => 38 "HTML" => 924 "PDF" => 418 ] ] "en" => array:11 [ "idiomaDefecto" => true "titulo" => "Estimation of multiple density-depth parameters from gravity inversion: Application to detached hanging wall systems of strike limited listric fault morphologies" "tienePdf" => "en" "tieneTextoCompleto" => "en" "tieneResumen" => array:2 [ 0 => "es" 1 => "en" ] "paginas" => array:1 [ 0 => array:2 [ "paginaInicial" => "49" "paginaFinal" => "65" ] ] "contieneResumen" => array:2 [ "es" => true "en" => true ] "contieneTextoCompleto" => array:1 [ "en" => true ] "contienePdf" => array:1 [ "en" => true ] "resumenGrafico" => array:2 [ "original" => 0 "multimedia" => array:7 [ "identificador" => "fig0050" "etiqueta" => "Figure 10" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr10.jpeg" "Alto" => 3226 "Ancho" => 2569 "Tamanyo" => 418716 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0060" class="elsevierStyleSimplePara elsevierViewall">(a) Error analysis between the observed and modeled gravity anomalies across the Aswaraopet master fault, Chintalpudi subbasin, India, (b) changes in misfit, coefficients of a 2nd degree polynomial, and depths to various density interfaces against the iteration number.</p>" ] ] ] "autores" => array:1 [ 0 => array:2 [ "autoresLista" => "V. Chakravarthi, M. Pramod Kumar" "autores" => array:2 [ 0 => array:2 [ "nombre" => "V." "apellidos" => "Chakravarthi" ] 1 => array:2 [ "nombre" => "M. Pramod" "apellidos" => "Kumar" ] ] ] ] ] "idiomaDefecto" => "en" "EPUB" => "https://multimedia.elsevier.es/PublicationsMultimediaV1/item/epub/S0016716915000082?idApp=UINPBA00004N" "url" => "/00167169/0000005400000001/v1_201505130244/S0016716915000082/v1_201505130244/en/main.assets" ] "itemAnterior" => array:17 [ "pii" => "S0016716915000021" "issn" => "00167169" "doi" => "10.1016/j.gi.2015.04.001" "estado" => "S300" "fechaPublicacion" => "2015-01-01" "aid" => "1" "documento" => "article" "licencia" => "http://creativecommons.org/licenses/by-nc-nd/4.0/" "subdocumento" => "fla" "cita" => "Geofisica Internacional. 2015;54:21-30" "abierto" => array:3 [ "ES" => true "ES2" => true "LATM" => true ] "gratuito" => true "lecturas" => array:2 [ "total" => 1407 "formatos" => array:3 [ "EPUB" => 35 "HTML" => 943 "PDF" => 429 ] ] "en" => array:11 [ "idiomaDefecto" => true "titulo" => "Dynamics of internal waves generated by mountain breeze in Alchichica Crater Lake, Mexico" "tienePdf" => "en" "tieneTextoCompleto" => "en" "tieneResumen" => array:2 [ 0 => "es" 1 => "en" ] "paginas" => array:1 [ 0 => array:2 [ "paginaInicial" => "21" "paginaFinal" => "30" ] ] "contieneResumen" => array:2 [ "es" => true "en" => true ] "contieneTextoCompleto" => array:1 [ "en" => true ] "contienePdf" => array:1 [ "en" => true ] "resumenGrafico" => array:2 [ "original" => 0 "multimedia" => array:7 [ "identificador" => "fig0020" "etiqueta" => "Figure 4" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr4.jpeg" "Alto" => 1715 "Ancho" => 1966 "Tamanyo" => 324564 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0030" class="elsevierStyleSimplePara elsevierViewall">Horizontal wind speed components and vertical deviations of water layers at mooring 1 (horizon 8 m) and mooring 2 (horizon 12.5 m). The numbers of the curves correspond to the buoys 1 and 2. Daily average wind velocity from January to May 2002 in Lake Alchichica is presented in the rectangle in the superior right corner.</p>" ] ] ] "autores" => array:1 [ 0 => array:2 [ "autoresLista" => "Anatoliy Filonov, Iryna Tereshchenko, Javier Alcocer, Cesar Monzón" "autores" => array:4 [ 0 => array:2 [ "nombre" => "Anatoliy" "apellidos" => "Filonov" ] 1 => array:2 [ "nombre" => "Iryna" "apellidos" => "Tereshchenko" ] 2 => array:2 [ "nombre" => "Javier" "apellidos" => "Alcocer" ] 3 => array:2 [ "nombre" => "Cesar" "apellidos" => "Monzón" ] ] ] ] ] "idiomaDefecto" => "en" "EPUB" => "https://multimedia.elsevier.es/PublicationsMultimediaV1/item/epub/S0016716915000021?idApp=UINPBA00004N" "url" => "/00167169/0000005400000001/v1_201505130244/S0016716915000021/v1_201505130244/en/main.assets" ] "en" => array:20 [ "idiomaDefecto" => true "titulo" => "Forward modeling of gravitational fields on hybrid multi-threaded cluster" "tieneTextoCompleto" => true "paginas" => array:1 [ 0 => array:2 [ "paginaInicial" => "31" "paginaFinal" => "48" ] ] "autores" => array:1 [ 0 => array:4 [ "autoresLista" => "Carlos Couder-Castañeda, José Carlos Ortiz-Alemán, Mauricio Gabriel Orozco-del-Castillo, Mauricio Nava-Flores" "autores" => array:4 [ 0 => array:4 [ "nombre" => "Carlos" "apellidos" => "Couder-Castañeda" "email" => array:1 [ 0 => "ccouder@esfm.ipn.mx" ] "referencia" => array:2 [ 0 => array:2 [ "etiqueta" => "<span class="elsevierStyleSup">a</span>" "identificador" => "aff0005" ] 1 => array:2 [ "etiqueta" => "<span class="elsevierStyleSup">*</span>" "identificador" => "cor0005" ] ] ] 1 => array:3 [ "nombre" => "José Carlos" "apellidos" => "Ortiz-Alemán" "referencia" => array:1 [ 0 => array:2 [ "etiqueta" => "<span class="elsevierStyleSup">a</span>" "identificador" => "aff0005" ] ] ] 2 => array:3 [ "nombre" => "Mauricio Gabriel" "apellidos" => "Orozco-del-Castillo" "referencia" => array:1 [ 0 => array:2 [ "etiqueta" => "<span class="elsevierStyleSup">a</span>" "identificador" => "aff0005" ] ] ] 3 => array:3 [ "nombre" => "Mauricio" "apellidos" => "Nava-Flores" "referencia" => array:1 [ 0 => array:2 [ "etiqueta" => "<span class="elsevierStyleSup">b</span>" "identificador" => "aff0010" ] ] ] ] "afiliaciones" => array:2 [ 0 => array:3 [ "entidad" => "Mexican Petroleum Institute, Eje Central Lázaro Cárdenas, 152, San Bartolo Atepehuacan, Gustavo A. Madero 07730, Ciudad de México" "etiqueta" => "a" "identificador" => "aff0005" ] 1 => array:3 [ "entidad" => "División de Ingeniería en Ciencias de la Tierra Facultad de Ingeniería, Universidad Nacional Autónoma de México Ciudad Universitaria, Delegación Coyoacán, 04510 México D.F., México" "etiqueta" => "b" "identificador" => "aff0010" ] ] "correspondencia" => array:1 [ 0 => array:3 [ "identificador" => "cor0005" "etiqueta" => "⁎" "correspondencia" => "Corresponding author." ] ] ] ] "resumenGrafico" => array:2 [ "original" => 0 "multimedia" => array:7 [ "identificador" => "fig0030" "etiqueta" => "Figure 6" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr6.jpeg" "Alto" => 2285 "Ancho" => 1186 "Tamanyo" => 319253 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0040" class="elsevierStyleSimplePara elsevierViewall">Partitioning by prisms.</p>" ] ] ] "textoCompleto" => "<span class="elsevierStyleSections"><span id="sec0005" class="elsevierStyleSection elsevierViewall"><span class="elsevierStyleSectionTitle" id="sect0025">Introduction</span><p id="par0005" class="elsevierStylePara elsevierViewall">The shared memory architecture is becoming more common every day in the highperformance computing market. With the hardware technology advances allowing us to have a great number of cores with access to the same memory locations, nowadays it is not that expensive to have systems with forty or sixty cores using shared memory. OpenMP is now a standard for symmetric multiprocessing systems (SMP) (even can be used transparently in the Xeon Phi architecture (<a class="elsevierStyleCrossRef" href="#bib0020">Calvin <span class="elsevierStyleItalic">et al</span>., 2013</a>)) sustained by a combination of function and compiler directives, a standard for the symmetric multiprocessing (SMP) systems (<a class="elsevierStyleCrossRefs" href="#bib0040">Dagum and Menon, 1998; Curtis-Maury <span class="elsevierStyleItalic">et al</span>., 2008</a>). OpenMP has proven to be a powerful tool for SMP due to several reasons: it is highly portable; it allows fine and medium granularity, each thread can access to the same global memory; and has their own private memory, and it also has a greater level of abstraction than MPI model (<a class="elsevierStyleCrossRef" href="#bib0015">Brunst and Mohr, 2008</a>).</p><p id="par0010" class="elsevierStylePara elsevierViewall">MPI is a library supported on the Same Program Multiple Data (SPMD) model and on the message passing model, with an explicit control of the parallelism. The processes can only read and write in their respective local memories and the data in these memories is transferred through calls to functions or procedures which implement the message passing model. Among the principal characteristics of MPI are that it can run in architectures of shared and distributed memory, is convenient for medium to coarse granularity and that employment is widely extended, making it extremely portable among platforms (<a class="elsevierStyleCrossRef" href="#bib0065">Krpic <span class="elsevierStyleItalic">et al</span>., 2012</a>).</p><p id="par0015" class="elsevierStylePara elsevierViewall">Using a hybrid programming model we can take advantage of the benefits of two programming models OpenMP and MPI. MPI is normally used to control the parallelism among cluster nodes, while OpenMP is applied in the creation of threads of fine granularity tasks within each node. Most applications developed in hybrid model involves a hierarchical model: MPI is for the higher level and OpenMP for the lower one (<a class="elsevierStyleCrossRef" href="#bib0090">Smith, 2000</a>).</p><p id="par0020" class="elsevierStylePara elsevierViewall">One of the potential benefits of using hybrid model programming consists of getting rid of the barrier of scaling that each model has. Generally, in MPI the scaling is limited by the communications cost, because an application is affected by the overload of communication when the number of processes is increased. In OpenMP the performance of an application is affected by cache coherence problems and access to shared memory which may lead to bottleneck issues between the execution threads when trying to access memory. By mixing these methodologies of parallel programming (OpenMP and MPI), we can obtain a more diverse granularity of the application and therefore a better performance than by using each one on its own.</p><p id="par0025" class="elsevierStylePara elsevierViewall">There are different applications which use this programming paradigm: OpenMP with MPI. For example, in the solution of sparse linear systems (<a class="elsevierStyleCrossRef" href="#bib0075">Mitin <span class="elsevierStyleItalic">et al</span>., 2012</a>), in graphcoloring algorithms (<a class="elsevierStyleCrossRef" href="#bib0085">Sariyuce <span class="elsevierStyleItalic">et al</span>., 2012</a>), in some models of fluid dynamics (<a class="elsevierStyleCrossRefs" href="#bib0005">Amritkar <span class="elsevierStyleItalic">et al</span>., 2012; Couder-Castañeda, 2009</a>) and finite element methods (<a class="elsevierStyleCrossRef" href="#bib0010">Boehmer <span class="elsevierStyleItalic">et al</span>., 2012</a>), in the simulation of turbulent fluids (<a class="elsevierStyleCrossRef" href="#bib0060">Jagannathan and Donzis, 2012</a>), even in the simulation of combustion chambers (Környei, 2012) and the implementation of neural networks (<a class="elsevierStyleCrossRef" href="#bib0050">Gonzalez <span class="elsevierStyleItalic">et al</span>., 2012</a>). As can be observed, there are numerous computational implementations using OpenMP with MPI, nevertheless, this type of design is supported on a natural decomposition of the domain (<a class="elsevierStyleCrossRef" href="#bib0025">CarrilloLedesma <span class="elsevierStyleItalic">et al</span>., 2013</a>), based on data. For our particular problem, each one of the processing units accesses all of the computational domain points.</p><p id="par0030" class="elsevierStylePara elsevierViewall">In <a class="elsevierStyleCrossRef" href="#fig0005">Figure 1</a> is depicted a domain decomposition, where each task (process or thread) is given some data subset on which to work. This domain decomposition is commonly used for example in finite differences problems where computational domains divided disjointly among the different tasks.</p><elsevierMultimedia ident="fig0005"></elsevierMultimedia><p id="par0035" class="elsevierStylePara elsevierViewall">On the other hand, in the direct conformation of gravimetric data, an initial model for the source body is constructed from geologicalgeophysical information. The anomaly of such model is calculated and compared to the observed anomaly, after which the parameters are adapted to improve the adjustment between them. These three steps that arrange the model properties — <span class="elsevierStyleItalic">anomalies calculation, comparison</span> and <span class="elsevierStyleItalic">adjustment</span> — are repeated up to the observed and calculated anomalies are similar enough.</p><p id="par0040" class="elsevierStylePara elsevierViewall">A mass volume can be approximated by a set of rectangular prisms; if chosen sufficiently small, each prism can be considered to have a constant density. Because of the superposition principle, the gravitational anomaly of a body can be approximated at anypoint by summing the effects of all the prisms over that point. Even though this methodology appears simple (by reducing the size of the prisms to better adjust the source body), computing time is considerably increased. There are other approaching methods of the gravitational anomaly that can simplify the required computation (mass points or tesseroids approximations), however, they may complicate the construction of the geological model (<a class="elsevierStyleCrossRef" href="#bib0055">Heck and Seitz, 2007</a>).</p></span><span id="sec0010" class="elsevierStyleSection elsevierViewall"><span class="elsevierStyleSectionTitle" id="sect0030">Application design</span><p id="par0045" class="elsevierStylePara elsevierViewall">The application consists of calculating the gravimetric anomaly produced by a rectangular prismatic body with constant density with respect to a group of observation points (see <a class="elsevierStyleCrossRef" href="#fig0010">Figure 2</a>). The set of prisms is known as an ensemble of prisms, which is not necessarily regular. A set of irregular prisms can be configured as long as the prisms are not superimposed. Because the gravitational field complies with the superposition principle with respect to the observation points, if <span class="elsevierStyleItalic">f</span> is the calculated response at a point (<span class="elsevierStyleItalic">x, y</span>), then the observed response at the point <span class="elsevierStyleItalic">f</span> (<span class="elsevierStyleItalic">x, y</span>) is given by:<elsevierMultimedia ident="eq0005"></elsevierMultimedia></p><elsevierMultimedia ident="fig0010"></elsevierMultimedia><p id="par0050" class="elsevierStylePara elsevierViewall">where <span class="elsevierStyleItalic">M</span> is the number of total prisms and <span class="elsevierStyleItalic">ρ</span> is the density of the prism.</p><p id="par0055" class="elsevierStylePara elsevierViewall">It is well known that the function that calculates the anomaly for a given prism from an observation point is written as follows (<a class="elsevierStyleCrossRef" href="#bib0080">Nagy <span class="elsevierStyleItalic">et al</span>., 2000</a>):<elsevierMultimedia ident="eq0010"></elsevierMultimedia></p><p id="par0060" class="elsevierStylePara elsevierViewall">where (<span class="elsevierStyleItalic">x</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">l</span></span><span class="elsevierStyleItalic">, y</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">l</span></span><span class="elsevierStyleItalic">, z</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">l</span></span>) is the top left vertex of the prism, (<span class="elsevierStyleItalic">x</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">r</span></span><span class="elsevierStyleItalic">, y</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">r</span></span><span class="elsevierStyleItalic">, z</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">r</span></span>) is the bottom right prism and (<span class="elsevierStyleItalic">x</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">p</span></span><span class="elsevierStyleItalic">, y</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">p</span></span><span class="elsevierStyleItalic">, z</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">p</span></span>) is the observation point and ρ the density, as shown in <a class="elsevierStyleCrossRef" href="#fig0015">Figure 3</a>.</p><elsevierMultimedia ident="fig0015"></elsevierMultimedia><p id="par0065" class="elsevierStylePara elsevierViewall">The aforementioned is a large scale problem since, for example, a synthetic problem conformed by a set of prisms of 300 × 300 × 150 = 13,500,000 elements, against an observation grid of 100 × 100 = 10,000 points, results in the calculation of 135,000,000,000 integrals or differentials to solve the entire problem. The formulations we used are included in appendix A.</p><p id="par0070" class="elsevierStylePara elsevierViewall">Computing time reduction in a numerical simulation is of great importance to diminish research costs. A simulation which lasts a week is likely to be costly, not only because the machine time is expensive, but also because it prohibits the quick acquisition of results to make modifications and predictions.</p><p id="par0075" class="elsevierStylePara elsevierViewall">In many projects to be parallelized, several times the serial algorithm does not show a natural decomposition which allows easily porting it to a parallel environment, or the trivial decomposition does not yield good performance results. For such reasons it is convenient to use a hybrid programming methodology, as the one developed and presented in this paper. This methodology provides an adequate programming design to obtain a superior performance.</p><p id="par0080" class="elsevierStylePara elsevierViewall">To develop a parallel program it is fundamental to search for the finest granularity, as in the methodology proposed by Foster (<a class="elsevierStyleCrossRef" href="#bib0045">Foster, 1995</a>). In this case it is possible to parallelize by prisms or by observation points. One of the requirements of the design is that it must be scalable, therefore the use of hybrid systems is quite appropriate; these systems are the most commonly used nowadays. Following Foster's methodology, it is necessary to begin with the finest granularity, in this case corresponds to OpenMP because it is in the lowest level. Subsequently the implementation follows with MPI, due to its coarse granularity.</p><span id="sec0015" class="elsevierStyleSection elsevierViewall"><span class="elsevierStyleSectionTitle" id="sect0035">Implementation in OpenMP</span><p id="par0085" class="elsevierStylePara elsevierViewall">We started our design with OpenMP because it handles shared memory and it is also the finest granularity. First we partitioned the domain into prisms, and for each prism we parallelized the calculation by observation points, as shown in <a class="elsevierStyleCrossRef" href="#fig0020">Figure 4</a>.</p><elsevierMultimedia ident="fig0020"></elsevierMultimedia><p id="par0090" class="elsevierStylePara elsevierViewall">This parallelization by observation points is trivial and does not offer a great design challenge, since we simply partition the calculation with respect to the observation grid for each prism (see the pseudo-code 1). However, this scheme has several drawbacks. One of them is that the performance is not optimal since the number of prisms is much greater than the number of observation points. In other words, this partitioning is efficient as long as there are not too many threads working upon the observation grid, thus avoiding a bottleneck issue as a consequence of the threads works in the same memory allocation. Maybe the worst drawback lies in the fact that the parallel environment is created and closed, i.e. for each prism, a function which parallely calculates the anomalies is executed, but such environment is closed once the execution is over, and reopened for the following prism, which results in an unnecessary overload and therefore decreases the performance. <a class="elsevierStyleCrossRef" href="#fig0025">Figure 5</a></p><elsevierMultimedia ident="fig0025"></elsevierMultimedia><p id="par0095" class="elsevierStylePara elsevierViewall"><span class="elsevierStyleBold">Listing 1.</span> Parallelization by observation points</p><p id="par0100" class="elsevierStylePara elsevierViewall">For each prism from 1 to M</p><p id="par0105" class="elsevierStylePara elsevierViewall">!$OMP PARRALLEL DO COLLAPSE(2)</p><p id="par0110" class="elsevierStylePara elsevierViewall">For each j from 1 to Ny</p><p id="par0115" class="elsevierStylePara elsevierViewall">For each i from 1 to Nx</p><p id="par0120" class="elsevierStylePara elsevierViewall">G(i,j)=Gz(parameters)+G(i,j)</p><p id="par0125" class="elsevierStylePara elsevierViewall">End For</p><p id="par0130" class="elsevierStylePara elsevierViewall">End For</p><p id="par0135" class="elsevierStylePara elsevierViewall">!$OMP END PARALLEL DO</p><p id="par0140" class="elsevierStylePara elsevierViewall">End For</p><p id="par0145" class="elsevierStylePara elsevierViewall"><span class="elsevierStyleBold">Listing 2.</span> Parallelization by observation points</p><p id="par0150" class="elsevierStylePara elsevierViewall">!$OMP PARRALLEL DO</p><p id="par0155" class="elsevierStylePara elsevierViewall">For each prism from 1 to M</p><p id="par0160" class="elsevierStylePara elsevierViewall">For each j from 1 to Ny</p><p id="par0165" class="elsevierStylePara elsevierViewall">For each i from 1 to Nx</p><p id="par0170" class="elsevierStylePara elsevierViewall">G(i,j)=Gz(parameters)+G(Thread, i,j)</p><p id="par0175" class="elsevierStylePara elsevierViewall">End For</p><p id="par0180" class="elsevierStylePara elsevierViewall">End For</p><p id="par0185" class="elsevierStylePara elsevierViewall">End For</p><p id="par0190" class="elsevierStylePara elsevierViewall">!$OMP END PARALLEL DO</p><p id="par0195" class="elsevierStylePara elsevierViewall">The other parallelization option is to use prisms i.e., making the threads divide the work per number of prisms (see pseudo-code 2). To avoid the coherence problems of the cache it is necessary to create a different memory space for each execution thread, because it is not feasible to create a single memory space for an unique observation grid, shared by all the threads.</p><p id="par0200" class="elsevierStylePara elsevierViewall">As observed in <a class="elsevierStyleCrossRef" href="#fig0030">Figure 6</a>, it is required to create an observation grid for each execution thread to avoid memory consistency problems. Bottleneck memory access issues are avoided since every thread writes in a different direction of the memory space. If only one grid were to be used, there would be access problems to the shared grid, which would create numerical inconsistencies.</p><elsevierMultimedia ident="fig0030"></elsevierMultimedia><p id="par0205" class="elsevierStylePara elsevierViewall">One of the characteristics of OpenMP is that the computing is distributed in an implicit manner, therefore the partitioning of the M prisms, which composes the problem, is done automatically using a balancing algorithm included in OpenMP. In this case the decision is left to the compiler, which is optimum 99% of the cases (<a class="elsevierStyleCrossRef" href="#bib0100">Zhang <span class="elsevierStyleItalic">et al</span>., 2004</a>).</p></span><span id="sec0020" class="elsevierStyleSection elsevierViewall"><span class="elsevierStyleSectionTitle" id="sect0040">OpenMP+MPI Implementation</span><p id="par0210" class="elsevierStylePara elsevierViewall">One of the advantages of the prism parallelization is that it is easier to implement in MPI, producing tasks of coarse granularity using the same design previously applied in OpenMP. Having the observation grid partitioned would result in a more complicated and less efficient design using MPI. Since the parallelization in MPI is explicit, we need to manually distribute the number of prisms through a modular expression. If <span class="elsevierStyleItalic">M</span> is the number of prisms to calculate and <span class="elsevierStyleItalic">p</span> is the MPI process number (numbered from 0 to <span class="elsevierStyleItalic">p</span>−1), then for each process <span class="elsevierStyleItalic">p</span> we define the beginning and end of the prisms to be processed by <span class="elsevierStyleItalic">p</span> as <span class="elsevierStyleItalic">p</span><span class="elsevierStyleInf">start</span> and <span class="elsevierStyleItalic">p</span><span class="elsevierStyleInf">end</span>, respectively. We define theinteger <span class="elsevierStyleItalic">s</span> as the quotient of the number of prisms <span class="elsevierStyleItalic">M</span> between the total number of processes <span class="elsevierStyleItalic">p</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">n</span></span> and <span class="elsevierStyleItalic">r</span> as the remainder, the procedure to determine <span class="elsevierStyleItalic">p</span><span class="elsevierStyleInf">start</span> and <span class="elsevierStyleItalic">p</span><span class="elsevierStyleInf">end</span> proceed as follows:<elsevierMultimedia ident="eq0015"></elsevierMultimedia><elsevierMultimedia ident="eq0020"></elsevierMultimedia></p><p id="par0215" class="elsevierStylePara elsevierViewall">Therefore<elsevierMultimedia ident="eq0025"></elsevierMultimedia></p><p id="par0220" class="elsevierStylePara elsevierViewall">and<elsevierMultimedia ident="eq0030"></elsevierMultimedia></p><p id="par0225" class="elsevierStylePara elsevierViewall">If <span class="elsevierStyleItalic">r ≠</span> 0 and <span class="elsevierStyleItalic">p</span> < <span class="elsevierStyleItalic">r</span>, then we adjust as:<elsevierMultimedia ident="eq0035"></elsevierMultimedia></p><p id="par0230" class="elsevierStylePara elsevierViewall">and<elsevierMultimedia ident="eq0040"></elsevierMultimedia></p><p id="par0235" class="elsevierStylePara elsevierViewall">If <span class="elsevierStyleItalic">r ≠</span> 0 and <span class="elsevierStyleItalic">p</span> ≥ <span class="elsevierStyleItalic">r</span>, then:<elsevierMultimedia ident="eq0045"></elsevierMultimedia></p><p id="par0240" class="elsevierStylePara elsevierViewall">and<elsevierMultimedia ident="eq0050"></elsevierMultimedia></p><p id="par0245" class="elsevierStylePara elsevierViewall">This way we can distribute the number of prisms <span class="elsevierStyleItalic">M</span> over <span class="elsevierStyleItalic">p</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">n</span></span> processes in a balanced manner; once this distribution is made, we can use the OpenMP implementation in each node. In other words, we occupy MPI to distribute the number of prisms in each node, and at the same time in each node we employ OpenMP to reduce the number of MPI processes, reducing communication time.</p><p id="par0250" class="elsevierStylePara elsevierViewall">In consequence, the application is partitioned by the number of prisms M, both in OpenMP as in MPI. Another option is to parallelize by prisms in MPI and by observation points in OpenMP. Even though this is a viable option, it is not very scalable due the drawback discussed in the previous subsection.</p><p id="par0255" class="elsevierStylePara elsevierViewall">Basically the design consists of allocating an observation grid per execution thread and a global observation grid in the master thread per computing node, subsequently the reduction of the sum of the grids per thread is done and stored in the global grid contained in the master thread, and finally at the end of the parallel calculation, every master thread will add their grid values to update the master thread of the master node using a MPI reduction method (see <a class="elsevierStyleCrossRef" href="#fig0035">Figure 7</a>).</p><elsevierMultimedia ident="fig0035"></elsevierMultimedia><p id="par0260" class="elsevierStylePara elsevierViewall">It is necessary to mention that the implementation of the code was made with the FORTRAN 2003 specification, using as development tool the Intel Cluster Toolkit version 2013 of Intel Corporation.</p></span></span><span id="sec0025" class="elsevierStyleSection elsevierViewall"><span class="elsevierStyleSectionTitle" id="sect0045">Performance experiments</span><p id="par0265" class="elsevierStylePara elsevierViewall">For the synthetic experiment we used a case composed by a cube of 700 × 700 × 50 prisms, with 7 contrasting spheres of variable density (see <a class="elsevierStyleCrossRef" href="#fig0040">Figure 8</a>). The spheres were conformed by 251,946 prisms and an observation grid of 150 × 100 = 15,000 points, to an elevation of 100 m. Therefore, the number of calls to a procedure required, to calculate the vector/tensor component of the gravity are 3,779,190,000; this classifies the experiment into a high-performance computing problem.</p><elsevierMultimedia ident="fig0040"></elsevierMultimedia><p id="par0270" class="elsevierStylePara elsevierViewall">We tested the parallelized code by observation points versus the version by prisms using OpenMP. The first parallel scheme is technically easier to implement because for each one of the prisms the calculation of the cycles corresponding to the tracking of the observation grid is parallelized. The second scheme has a more complex implementation because it requires different space memory allocations. The performance experiments that calculate the components of the gravimetric tensor <span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">xx</span></span><span class="elsevierStyleItalic">, G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">zz</span></span><span class="elsevierStyleItalic">, G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">xy</span></span><span class="elsevierStyleItalic">, G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">yz</span></span> using both versions were carried out in the server described below. We did not include the performance analysis for the vectorial components <span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">x</span></span>, <span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">y</span></span> and <span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">z</span></span>, since its behavior is very similar.</p><p id="par0275" class="elsevierStylePara elsevierViewall">The characteristics of the server where the tests took place with OpenMP are as follows:<ul class="elsevierStyleList" id="lis0005"><li class="elsevierStyleListItem" id="lsti0005"><span class="elsevierStyleLabel">•</span><p id="par0280" class="elsevierStylePara elsevierViewall">4 Xeon Intel (R) Xeon (R) E7-4850 Processors</p></li><li class="elsevierStyleListItem" id="lsti0010"><span class="elsevierStyleLabel">•</span><p id="par0285" class="elsevierStylePara elsevierViewall">10 processing cores per processor</p></li><li class="elsevierStyleListItem" id="lsti0015"><span class="elsevierStyleLabel">•</span><p id="par0290" class="elsevierStylePara elsevierViewall">Hyperthreading Technology deactivated</p></li><li class="elsevierStyleListItem" id="lsti0020"><span class="elsevierStyleLabel">•</span><p id="par0295" class="elsevierStylePara elsevierViewall">512 GB of RAM memory</p></li><li class="elsevierStyleListItem" id="lsti0025"><span class="elsevierStyleLabel">•</span><p id="par0300" class="elsevierStylePara elsevierViewall">Red Hat 6.3 as operating system</p></li></ul></p><p id="par0305" class="elsevierStylePara elsevierViewall">To interfere as least as possible with the processes of the operating system, we used 35 of the 40 cores available in the server. Initially we can say that the prisms implementation and with independent memory per core was 3.22X faster than its counterpart of observation points. Therefore, while the observation points version uses 757 s, the version partitioned by prisms only consumes 235 s.</p><p id="par0310" class="elsevierStylePara elsevierViewall">The comparison of the computing times per thread in the partition by prisms against the partition by observation points is shown in <a class="elsevierStyleCrossRef" href="#fig0045">Figure 9</a>.</p><elsevierMultimedia ident="fig0045"></elsevierMultimedia><p id="par0315" class="elsevierStylePara elsevierViewall">In <a class="elsevierStyleCrossRef" href="#fig0045">Figure 9</a> it can be seen that the performance behavior is kept stable in both types of partitioning; however, by prisms the best reduction in time is obtained. To prove that the partitioning by prisms keeps reduction time practically linear, we graphed the <span class="elsevierStyleItalic">speedup</span> of the performance by prisms.</p><p id="par0320" class="elsevierStylePara elsevierViewall">For the <span class="elsevierStyleItalic">speed-up</span> shown in <a class="elsevierStyleCrossRef" href="#fig0050">Figure 10</a>, we considered a serial fraction of 5% (<span class="elsevierStyleItalic">f</span> = 0.05). In this fraction the necessary reductions to sum the grid points for each core are contemplated, the total result of the anomaly is calculated as:<elsevierMultimedia ident="eq0055"></elsevierMultimedia></p><elsevierMultimedia ident="fig0050"></elsevierMultimedia><p id="par0325" class="elsevierStylePara elsevierViewall">where, for each (<span class="elsevierStyleItalic">i, j</span>) <span class="elsevierStyleItalic">O</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">f</span></span> is the final observation, <span class="elsevierStyleItalic">O</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">t</span></span> is the calculated grid by core t and <span class="elsevierStyleItalic">Nt</span> is the total number of cores. Therefore, we considered that 95% of the code is parallel, and according to Gustafson's law, the maximum textitspeedup that can be obtained with 35 processing units, in this case cores, is 35 + (1-35) × (0.05) = 33.30. The experimentally obtained <span class="elsevierStyleItalic">speed-up</span> result was 31.31, which represents an absolute difference of 1.99 and a relative difference of 0.06, which shows the efficiency of the implementation.</p><p id="par0330" class="elsevierStylePara elsevierViewall">Another indicator which must be contemplated is the efficiency <span class="elsevierStyleItalic">E</span>, defined as:<elsevierMultimedia ident="eq0060"></elsevierMultimedia></p><p id="par0335" class="elsevierStylePara elsevierViewall">where <span class="elsevierStyleItalic">S</span>(<span class="elsevierStyleItalic">n</span>) is the obtained <span class="elsevierStyleItalic">speed-up</span> with <span class="elsevierStyleItalic">n</span> tasks, and indicates how busy the processors or cores are during execution. <a class="elsevierStyleCrossRef" href="#fig0055">Figure 11</a> shows that the efficiency by prisms is high since on average every processing core is kept busy 94% of the time. The efficiency <span class="elsevierStyleItalic">E</span> also indicates that the partitioning by prisms is scalable, which means that we can increase the number of processors to improve time reduction while not losing efficiency in the use of many cores. The scalability must be contemplated as a good design of the parallel program since it allows scaling the algorithm, so we could expect when the number of processing units is increasing the performance is not affected.</p><elsevierMultimedia ident="fig0055"></elsevierMultimedia><p id="par0340" class="elsevierStylePara elsevierViewall">The design using OpenMP is limited to architectures of machines of shared memory, therefore we are now making experiments using a hybrid machine commonly known as <span class="elsevierStyleItalic">cluster</span>, mixing OpenMP+MPI with the methodology described in subsection 2.2.</p><p id="par0345" class="elsevierStylePara elsevierViewall">The characteristics of the cluster where the numerical experiments were carried out are as follows:<ul class="elsevierStyleList" id="lis0010"><li class="elsevierStyleListItem" id="lsti0030"><span class="elsevierStyleLabel">•</span><p id="par0350" class="elsevierStylePara elsevierViewall">Node: Intel(R) Xeon(R) model X5550 processors with four physical cores processor.</p></li><li class="elsevierStyleListItem" id="lsti0035"><span class="elsevierStyleLabel">•</span><p id="par0355" class="elsevierStylePara elsevierViewall">44 processing nodes</p></li><li class="elsevierStyleListItem" id="lsti0040"><span class="elsevierStyleLabel">•</span><p id="par0360" class="elsevierStylePara elsevierViewall">Hyperthreading Technology enabled</p></li><li class="elsevierStyleListItem" id="lsti0045"><span class="elsevierStyleLabel">•</span><p id="par0365" class="elsevierStylePara elsevierViewall">40 GB of RAM memory per node</p></li><li class="elsevierStyleListItem" id="lsti0050"><span class="elsevierStyleLabel">•</span><p id="par0370" class="elsevierStylePara elsevierViewall">Red Hat 6.3 as operating system</p></li><li class="elsevierStyleListItem" id="lsti0055"><span class="elsevierStyleLabel">•</span><p id="par0375" class="elsevierStylePara elsevierViewall">InfiniBand 300Gbps</p></li></ul></p><p id="par0380" class="elsevierStylePara elsevierViewall">We started by evaluating the performance of each cluster node, as opposed to the experiments done with the 40 cores server, where hyperthreading technology (HT) was disabled. In this case HT is enabled, so each node reports the handling of 8 execution threads instead of 4, but we only have 4 physical floating point units (FPUs). Since our program is computationally intensive, we have to find out if we benefit from the use of HT; some studies have reported the use of HT in numerical applications can modify the performance by 30% (<a class="elsevierStyleCrossRef" href="#bib0035">Curtis-Maury <span class="elsevierStyleItalic">et al</span>., 2008</a>).</p><p id="par0385" class="elsevierStylePara elsevierViewall">The behavior obtained using one node containing 1 processor with four real cores with HT enabled/disabled can be exposed by an analysis of the computing time graph, shown in <a class="elsevierStyleCrossRef" href="#fig0060">Figure 12</a>, the problems analyzed is setup with 13,997 prism conforming a sphere with a mesh of 150 × 100 observations points.</p><elsevierMultimedia ident="fig0060"></elsevierMultimedia><p id="par0390" class="elsevierStylePara elsevierViewall">As can be observed, the best run-time performance that we can obtain from the processor in HT mode is not produced with 4 execution threads, the best performance is obtained with 8 threads, but the time is not doubly improved. This occurs since two threads share the same FPU and the HT technology is designed to quickly switch between threads, and therefore there is not a double improvement in time but the performance gain is approximately 30%, which means that the two threads make better use of the FPU,therefore is necessary to create two threads per core to obtain the maximum performance when the HT is enabled. When the HT is disabled we have an asymptotic behavior after 4 threads but did not reach the performance obtained using the HT mode.</p><p id="par0395" class="elsevierStylePara elsevierViewall">In <a class="elsevierStyleCrossRef" href="#fig0065">Figure 13</a> it can be observed that when HT technology is enabled we obtain a linear <span class="elsevierStyleItalic">speed-up</span> up to 4 execution threads; this is obvious since there are only 4 physical FPUs. Nevertheless, with the HT we can have a better use of the FPUs improving the <span class="elsevierStyleItalic">speed-up</span> up to 5.60, this is, 1.6 more processing units. With the HT disabled, a similar performance is observed up to 4 threads, although this performance is below the one with the HT enabled. For more than 4 threads, the performance with the HT disabled begins to decrease.</p><elsevierMultimedia ident="fig0065"></elsevierMultimedia><p id="par0400" class="elsevierStylePara elsevierViewall">The efficiency corresponding to the <span class="elsevierStyleItalic">speedup</span> shown in <a class="elsevierStyleCrossRef" href="#fig0065">Figure 13</a> is graphed in <a class="elsevierStyleCrossRef" href="#fig0070">Figure 14</a>; notice how HT is able to increase the efficiency of some intensive floating point applications up to 30% when the number of threads equals the number of physical cores. Of course, the best efficiency is obtained with 4 threads because we have 4 FPUs, nevertheless we can get a better performance creating 4 threads more using the additional virtual processors created by the HT.</p><elsevierMultimedia ident="fig0070"></elsevierMultimedia><p id="par0405" class="elsevierStylePara elsevierViewall">To analyze the performance in a node with the original problem (shown in <a class="elsevierStyleCrossRef" href="#fig0040">Figure 8</a>), we added a processor in the second socket to one of the nodes. In other words, we created a node with eight real cores to compare it against a node with four real cores with HT enabled. The results of execution time are shown in <a class="elsevierStyleCrossRef" href="#fig0075">Figure 15</a>.</p><elsevierMultimedia ident="fig0075"></elsevierMultimedia><p id="par0410" class="elsevierStylePara elsevierViewall">It must be taken in consideration our cluster nodes are composed of a single processor with HT enabled, we only added another processor in the second socket to a node for experimental purposes. To have a better perspective of the performance, we determined the <span class="elsevierStyleItalic">speed-up</span> through both node configurations we showed in <a class="elsevierStyleCrossRef" href="#fig0080">Figure 16</a>. A nearly perfect <span class="elsevierStyleItalic">speed-up</span> can be observed for the node with 8 real cores, but a increase of 1.8 processing units for the node with 4 real cores with HT enabled. Evidently, if we enable HT in the machine with 8 real cores we would have 16 reported processors, and to get its maximum performance we would have to create 16 threads. However, the experimentation with 8 real cores was only for comparisonpurposes, since the cluster configuration is made of one node with 4 real cores with HT enabled. It can also be observed that each node of the cluster reduces the time by a factor of 5,8X against the serial version.</p><elsevierMultimedia ident="fig0080"></elsevierMultimedia><p id="par0415" class="elsevierStylePara elsevierViewall">Once it is known that the best node performance is achieved with 8 execution threads for a node with 4 real cores with HT enabled and with the partition by prisms, we can consider each node as a processing unit and distribute the computing with MPI, obtaining a code with a hybrid programming model.</p><p id="par0420" class="elsevierStylePara elsevierViewall">The <span class="elsevierStyleItalic">speed-up</span> results using 25 cluster nodes are displayed in <a class="elsevierStyleCrossRef" href="#fig0085">Figure 17</a>; a serial fraction of 5% (<span class="elsevierStyleItalic">f</span> = 0.05) is considered since in MPI there needs to be reductions in the sum for each node. The results show that a nearly perfect <span class="elsevierStyleItalic">speed-up</span> is obtained up to 22 nodes. From this point on, the speed-up starts declining because the application performance is affected by the communication time between nodes. In other words, the granularity of the tasks begins to decrease for this problem of 249,946 prisms for 30 nodes. This implies that by increasing the granularity of the problem (increasing the number of prisms), the <span class="elsevierStyleItalic">speed-up</span> is also increased until it becomes stable, to decrease again later on.</p><elsevierMultimedia ident="fig0085"></elsevierMultimedia><p id="par0425" class="elsevierStylePara elsevierViewall">The efficiency graph related with the <span class="elsevierStyleItalic">speedup</span> of <a class="elsevierStyleCrossRef" href="#fig0080">Figure 16</a> is shown in <a class="elsevierStyleCrossRef" href="#fig0090">Figure 18</a>. Notice how the efficiency is below 90% after node 23. If we consider that we have an increase in speed 5.8 times per node (from <a class="elsevierStyleCrossRef" href="#fig0075">Figure 15</a>) with respect to the serial version, then the optimum speed factor for this cluster (for a problem of 251,946 prisms) is approximately 5.8 × 22 = 127.6X, i.e. 127 times faster than the serial version. Obviously, as previously stated, if we increase the granularity (number of prisms), the efficiency increases as well. In fact, we reduce the computation time of the spheres problem from 1h 34 m 56 s to 34 s</p><elsevierMultimedia ident="fig0090"></elsevierMultimedia><span id="sec0030" class="elsevierStyleSection elsevierViewall"><span class="elsevierStyleSectionTitle" id="sect0050">Comparison with similar programs</span><p id="par0430" class="elsevierStylePara elsevierViewall">To provide a better perspective of the obtained performance with the parallel implementation of our code, we compared against an open source code called <span class="elsevierStyleItalic">tesseroids</span> (<a class="elsevierStyleCrossRef" href="#bib0095">Uieda <span class="elsevierStyleItalic">et al</span>., 2011</a>), which can be downloaded from/dx.doi.org/10.6084/m9.figshare.786514. We chose the problem of 13,997 prisms which form an sphere against 10,000 observation points, since <span class="elsevierStyleItalic">tesseroids</span> is not distributed (can not be executed on a cluster) and can only accelerate the computation in shared memory machines. The execution times are shown using the bar chart in <a class="elsevierStyleCrossRef" href="#fig0095">Figure 19</a>, where it can be observed that with HT disabled we have a speed improvement of 2.14X and with HT enabled of 2.51X with respect to <span class="elsevierStyleItalic">tesseroids</span>. This performance improvement is due to our program design takes a better advantage of the processor technology and keeps the cores occupied to the maximum by using a prisms parallelization scheme based on different memory allocations. This can be observed in the CPU history graph shown in the <a class="elsevierStyleCrossRef" href="#fig0100">Figure 20</a>.</p><elsevierMultimedia ident="fig0095"></elsevierMultimedia><elsevierMultimedia ident="fig0100"></elsevierMultimedia></span></span><span id="sec0035" class="elsevierStyleSection elsevierViewall"><span class="elsevierStyleSectionTitle" id="sect0055">Numerical code validation</span><p id="par0435" class="elsevierStylePara elsevierViewall">The main challenge of the parallel programming is to decompose the program into components which can be simultaneously executed to reduce computing time. The decomposition level is highly influenced by the type of architecture of the parallel machine. In this case the design was made with a hybrid programming strategy to get the maximum out of the architecture. Although the reduction of the execution time is the main objective of the parallel programming, the validation of the code is a topic that should be covered since inherent parallelism programming errors can occur.</p><p id="par0440" class="elsevierStylePara elsevierViewall">To measure the error, we compared the previously validated sequential counterpart in the synthetic experiment with the analytical solution. We used the L2 norm error or RMS (<a class="elsevierStyleCrossRef" href="#bib0070">Mickus and Hinojosa, 2001</a>; Menke, 1989), defined as:<elsevierMultimedia ident="eq0065"></elsevierMultimedia></p><p id="par0445" class="elsevierStylePara elsevierViewall">where gi,jp is the tensor component, parallely computed, and gi,js is the serially calculated component.</p><p id="par0450" class="elsevierStylePara elsevierViewall">In <a class="elsevierStyleCrossRef" href="#tbl0005">Table 1</a> the errors of the gravimetric tensor components are shown, parallely calculated with respect to the serial form.</p><elsevierMultimedia ident="tbl0005"></elsevierMultimedia><p id="par0455" class="elsevierStylePara elsevierViewall">From the errors obtained it can be noticed that there is no numerical difference, therefore the parallel version is correctly implemented.</p><p id="par0470" class="elsevierStylePara elsevierViewall">The surface graphs of the gravitational fields are shown in <a class="elsevierStyleCrossRef" href="#fig0105">Figure 21</a>. These graphs correspond to the components of the gravimetric tensor, calculated for the synthetic case studied in <a class="elsevierStyleCrossRef" href="#fig0040">Figure 8</a>.</p><elsevierMultimedia ident="fig0105"></elsevierMultimedia></span><span id="sec0040" class="elsevierStyleSection elsevierViewall"><span class="elsevierStyleSectionTitle" id="sect0060">Conclusions</span><p id="par0475" class="elsevierStylePara elsevierViewall">A parallel design for the calculation of the vectorial and tensorial components of the gravity anomaly was implemented and validated using a hybrid methodology with OpenMP and MPI. The numerical experiments and the obtained indicators validate that the implementation is very efficient and that it also yields good results with respect to the numerical solution.</p><p id="par0480" class="elsevierStylePara elsevierViewall">We show that using the simplest or most trivial parallelization form does not contribute to the attainment of the best performance or the greatest exploitation of the platform. For our case, even though the partitioning by prisms requires a greater investment in the design and implementation, it was the most advantageous with respect to performance.</p><p id="par0485" class="elsevierStylePara elsevierViewall">The HT technology could improve some numerical intensive applications up to 30%, nevertheless, to get the best performance it is necessary to create two threads per core when the HT is enabled.</p><p id="par0490" class="elsevierStylePara elsevierViewall">We also conclude that this design can serve as a benchmark for solving problems which require the parallelization of schemes where the decomposition of the domain is not trivial or is shared by the processing units, as is the case of the observation grid. Finally the correct exploitation of OpenMP and MPI, jointly, can become a fundamental tool for parallel programming in clusters.</p></span><span id="sec0045" class="elsevierStyleSection elsevierViewall"><span class="elsevierStyleSectionTitle" id="sect0065">Future work</span><p id="par0495" class="elsevierStylePara elsevierViewall">As future work we pretend to implement the code in CUDA NVIDIA with TESLA technology and compare these results with the cluster performance results presented in this paper, as the measurement of the error introduced by CUDA in single and double precision. The implementation in CUDA is a work of interest since the reduction of the variable values in CUDA technology is very complicated when used in shared form, as is the case with the observation grid.</p></span></span>" "textoCompletoSecciones" => array:1 [ "secciones" => array:13 [ 0 => array:3 [ "identificador" => "xres502921" "titulo" => "Resumen" "secciones" => array:1 [ 0 => array:1 [ "identificador" => "abst0005" ] ] ] 1 => array:2 [ "identificador" => "xpalclavsec524090" "titulo" => "Palabras clave" ] 2 => array:3 [ "identificador" => "xres502920" "titulo" => "Abstract" "secciones" => array:1 [ 0 => array:1 [ "identificador" => "abst0010" ] ] ] 3 => array:2 [ "identificador" => "xpalclavsec524091" "titulo" => "Keywords" ] 4 => array:2 [ "identificador" => "sec0005" "titulo" => "Introduction" ] 5 => array:3 [ "identificador" => "sec0010" "titulo" => "Application design" "secciones" => array:2 [ 0 => array:2 [ "identificador" => "sec0015" "titulo" => "Implementation in OpenMP" ] 1 => array:2 [ "identificador" => "sec0020" "titulo" => "OpenMP+MPI Implementation" ] ] ] 6 => array:3 [ "identificador" => "sec0025" "titulo" => "Performance experiments" "secciones" => array:1 [ 0 => array:2 [ "identificador" => "sec0030" "titulo" => "Comparison with similar programs" ] ] ] 7 => array:2 [ "identificador" => "sec0035" "titulo" => "Numerical code validation" ] 8 => array:2 [ "identificador" => "sec0040" "titulo" => "Conclusions" ] 9 => array:2 [ "identificador" => "sec0045" "titulo" => "Future work" ] 10 => array:2 [ "identificador" => "xack163406" "titulo" => "Acknowledgment" ] 11 => array:1 [ "titulo" => "<span class="elsevierStyleSectionTitle" id="sect0085">Further reading</span>" ] 12 => array:1 [ "titulo" => "References" ] ] ] "pdfFichero" => "main.pdf" "tienePdf" => true "fechaRecibido" => "2013-10-18" "fechaAceptado" => "2014-03-11" "PalabrasClave" => array:2 [ "es" => array:1 [ 0 => array:4 [ "clase" => "keyword" "titulo" => "Palabras clave" "identificador" => "xpalclavsec524090" "palabras" => array:6 [ 0 => "gravedad" 1 => "gradiometría" 2 => "OpenMP" 3 => "MPI" 4 => "hyper-threading" 5 => "clusters." ] ] ] "en" => array:1 [ 0 => array:4 [ "clase" => "keyword" "titulo" => "Keywords" "identificador" => "xpalclavsec524091" "palabras" => array:6 [ 0 => "gravity" 1 => "gradiometry" 2 => "OpenMP" 3 => "MPI" 4 => "hyper-threading" 5 => "clusters." ] ] ] ] "tieneResumen" => true "resumen" => array:2 [ "es" => array:2 [ "titulo" => "Resumen" "resumen" => "<span id="abst0005" class="elsevierStyleSection elsevierViewall"><p id="spar0005" class="elsevierStyleSimplePara elsevierViewall">La solución analítica de las componentes del tensor gravimétrico, utilizando la ecuación del potencial gravitacional para un ensamble volumétrico compuesto de prismas de densidad constante, requiere un alto costo computacional. Esto se debe a que el potencial gravitacional de cada uno de estos prismas tiene que ser calculado para todos los puntos de una malla de observación previamente definida, lo cual resulta en una carga computacional de gran escala. En este trabajo introducimos un diseño híbrido y su implementación paralela basada en OpenMP y MPI, para el cálculo de las componentes vectoriales del campo gravimétrico (<span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">x</span></span>, <span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">y</span></span>, <span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">z</span></span>) y las componentes del tensor gravimétrico (<span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">xx</span></span><span class="elsevierStyleItalic">, G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">xy</span></span><span class="elsevierStyleItalic">, G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">zz</span></span><span class="elsevierStyleItalic">, G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">yy</span></span><span class="elsevierStyleItalic">, G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">yz</span></span><span class="elsevierStyleItalic">, G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">zz</span></span>).El rendimiento obtenido conlleva a óptimas relaciones del speed-up, ya que el tiempo de cómputo es drásticamente reducido. La técnica de paralelización aplicada consiste en descomponer el problema en grupos de prismas y utilizar diferentes espacios de memoria por núcleo de procesamiento, con el fin de evitar los problemas de cuello de botella cuando se accesa a la memoria compartida de un nodo del cluster, que se producen generalmente cuando varios hilos de ejecución acceden a la misma región en OpenMP. Debido a que OpenMP solo puede utilizarse en sistemas de memoria compartida es necesario utilizar MPI para la distribución del cálculo entre los nodos del cluster, dando como resultado un código híbrido OpenMP+MPI altamente eficiente con un speed-up prácticamente perfecto. Adicionalmente los resultados numéricos fueron validados con respecto a su contraparte secuencial.</p></span>" ] "en" => array:2 [ "titulo" => "Abstract" "resumen" => "<span id="abst0010" class="elsevierStyleSection elsevierViewall"><p id="spar0010" class="elsevierStyleSimplePara elsevierViewall">The analytic solution of the gravimetric tensor components, making use of the gravitational potential equation for a three-dimensional volumetric assembly composed of unit prisms of constant density, demands a high computational cost. This is due to the gravitational potential of each one of these prisms must be calculated for all of the points of a previously defined observation grid, which turns out in a large scale computational cost. In this work we introduce a hybrid design and its parallel implementation, based on OpenMP and MPI, for the calculation of the vectorial components of the gravimetric field and the components of the gravimetric tensor. Since the computing time is drastically reduced, the obtained performance leads close to optimal speed-up ratios. The applied parallelization technique consists of decomposing the problem into groups of prisms and using different memory allocations per processing core to avoid bottleneck issues when accessing the main memory in one cluster node, which are generally produced when using too many execution threads over the same region in OpenMP. Due OpenMP can be only used on shared memory systems is necessary to use MPI for the calculation distribution among cluster nodes, giving as a result a hybrid code (OpenMP+MPI) highly efficient and with a nearly perfect speed-up. Additionally the numerical results were validated with respect to its sequential counterpart.</p></span>" ] ] "apendice" => array:1 [ 0 => array:1 [ "seccion" => array:1 [ 0 => array:4 [ "apendice" => "<p id="par0505" class="elsevierStylePara elsevierViewall">The Earth's gravitational potential <span class="elsevierStyleItalic">G</span> is a scalar quantity, its shape can be constrained by its slope in the <span class="elsevierStyleItalic">x</span>, <span class="elsevierStyleItalic">y</span> and <span class="elsevierStyleItalic">z</span> directions, called the gravitational attraction <span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">x</span></span>, <span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">y</span></span> and <span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">z</span></span> (gravity vector field). In this work, we have investigated how to parallelize the analytical calculation of the components of the gravity field vector and the gravity gradients represented by a nine component tensor, because of the symmetrical or irrotational attribute, the gravity gradient tensor is reduced to only six independent components: <span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf">xx</span>, (the vertical gravity gradient), and For the right rectangular prism model, the analytical formulae for the three components vectors and the six gravity gradient components, corresponding to the Eq. <a class="elsevierStyleCrossRef" href="#eq0010">(2)</a> are given by:<elsevierMultimedia ident="eq0070"></elsevierMultimedia></p>" "etiqueta" => "Appendix A" "titulo" => "Calculation of gravitational quantities" "identificador" => "sec0050" ] ] ] ] "multimedia" => array:36 [ 0 => array:7 [ "identificador" => "fig0005" "etiqueta" => "Figure 1" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr1.jpeg" "Alto" => 1410 "Ancho" => 2029 "Tamanyo" => 295183 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0015" class="elsevierStyleSimplePara elsevierViewall">The domain decomposition based on data for an OpenMP+MPI application.</p>" ] ] 1 => array:7 [ "identificador" => "fig0010" "etiqueta" => "Figure 2" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr2.jpeg" "Alto" => 1086 "Ancho" => 2202 "Tamanyo" => 305857 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0020" class="elsevierStyleSimplePara elsevierViewall">Decomposition of the calculation of M prisms with respect to the observation grid: (a) regular prism assembly, (b) irregular prism assembly.</p>" ] ] 2 => array:7 [ "identificador" => "fig0015" "etiqueta" => "Figure 3" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr3.jpeg" "Alto" => 1155 "Ancho" => 1469 "Tamanyo" => 89657 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0025" class="elsevierStyleSimplePara elsevierViewall">Calculation of a prism with respect to a point of observation.</p>" ] ] 3 => array:7 [ "identificador" => "fig0020" "etiqueta" => "Figure 4" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr4.jpeg" "Alto" => 1155 "Ancho" => 1468 "Tamanyo" => 155282 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0030" class="elsevierStyleSimplePara elsevierViewall">Partitioning by observation points.</p>" ] ] 4 => array:7 [ "identificador" => "fig0025" "etiqueta" => "Figure 5" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr5.jpeg" "Alto" => 1159 "Ancho" => 1468 "Tamanyo" => 196089 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0035" class="elsevierStyleSimplePara elsevierViewall">Parallel region behavior: (a) pseudo-code (1), (b) pseudo-code (2).</p>" ] ] 5 => array:7 [ "identificador" => "fig0030" "etiqueta" => "Figure 6" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr6.jpeg" "Alto" => 2285 "Ancho" => 1186 "Tamanyo" => 319253 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0040" class="elsevierStyleSimplePara elsevierViewall">Partitioning by prisms.</p>" ] ] 6 => array:7 [ "identificador" => "fig0035" "etiqueta" => "Figure 7" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr7.jpeg" "Alto" => 2009 "Ancho" => 1211 "Tamanyo" => 325105 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0045" class="elsevierStyleSimplePara elsevierViewall">OpenMP+MPI design.</p>" ] ] 7 => array:7 [ "identificador" => "fig0040" "etiqueta" => "Figure 8" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr8.jpeg" "Alto" => 1233 "Ancho" => 2770 "Tamanyo" => 186444 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0050" class="elsevierStyleSimplePara elsevierViewall">Synthetic problem setup with 7 spheres of variable density contrast (not scaled). Ensemble size of 22<span class="elsevierStyleHsp" style=""></span>km x 22<span class="elsevierStyleHsp" style=""></span>km x 8<span class="elsevierStyleHsp" style=""></span>km, 251,946 prisms conform the spheres.</p>" ] ] 8 => array:7 [ "identificador" => "fig0045" "etiqueta" => "Figure 9" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr9.jpeg" "Alto" => 1417 "Ancho" => 1995 "Tamanyo" => 344581 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0055" class="elsevierStyleSimplePara elsevierViewall">Comparison between execution time used between the partition by prisms against the partition by observation points (one thread per core).</p>" ] ] 9 => array:7 [ "identificador" => "fig0050" "etiqueta" => "Figure 10" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr10.jpeg" "Alto" => 1418 "Ancho" => 1999 "Tamanyo" => 308978 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0060" class="elsevierStyleSimplePara elsevierViewall"><span class="elsevierStyleItalic">Speed-up</span> of the partitioning by prisms (one thread per core).</p>" ] ] 10 => array:7 [ "identificador" => "fig0055" "etiqueta" => "Figure 11" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr11.jpeg" "Alto" => 1457 "Ancho" => 1915 "Tamanyo" => 339594 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0065" class="elsevierStyleSimplePara elsevierViewall">Efficiency of the partitioning by prisms.</p>" ] ] 11 => array:7 [ "identificador" => "fig0060" "etiqueta" => "Figure 12" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr12.jpeg" "Alto" => 1638 "Ancho" => 2086 "Tamanyo" => 383937 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0070" class="elsevierStyleSimplePara elsevierViewall">Computing time using only one node with HT enabled/disabled, calculating a problem of 13,997 prisms with 10,000 observation points.</p>" ] ] 12 => array:7 [ "identificador" => "fig0065" "etiqueta" => "Figure 13" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr13.jpeg" "Alto" => 1718 "Ancho" => 2074 "Tamanyo" => 568680 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0075" class="elsevierStyleSimplePara elsevierViewall"><span class="elsevierStyleItalic">Speed-up</span> using only one node with HT enabled/disabled corresponding to the execution times shown in the <a class="elsevierStyleCrossRef" href="#fig0055">Figure 11</a>.</p>" ] ] 13 => array:7 [ "identificador" => "fig0070" "etiqueta" => "Figure 14" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr14.jpeg" "Alto" => 1691 "Ancho" => 2159 "Tamanyo" => 562573 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0080" class="elsevierStyleSimplePara elsevierViewall">Efficiency using only one node with 4 cores with the HT enabled/disabled for the problem of 13,997 prisms.</p>" ] ] 14 => array:7 [ "identificador" => "fig0075" "etiqueta" => "Figure 15" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr15.jpeg" "Alto" => 1609 "Ancho" => 2157 "Tamanyo" => 472060 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0085" class="elsevierStyleSimplePara elsevierViewall">Eight real cores with HT disabled vs four real cores with HT enabled for the problem setup in the <a class="elsevierStyleCrossRef" href="#fig0040">Figure 8</a>.</p>" ] ] 15 => array:7 [ "identificador" => "fig0080" "etiqueta" => "Figure 16" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr16.jpeg" "Alto" => 1717 "Ancho" => 2067 "Tamanyo" => 557281 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0090" class="elsevierStyleSimplePara elsevierViewall">Eight real cores with the HT disabled vs four real cores with HT enabled for the problem setup in the <a class="elsevierStyleCrossRef" href="#fig0040">Figure 8</a>.</p>" ] ] 16 => array:7 [ "identificador" => "fig0085" "etiqueta" => "Figure 17" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr17.jpeg" "Alto" => 1713 "Ancho" => 1867 "Tamanyo" => 382116 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0095" class="elsevierStyleSimplePara elsevierViewall"><span class="elsevierStyleItalic">Speed-up</span> obtained using 25 cluster nodes.</p>" ] ] 17 => array:7 [ "identificador" => "fig0090" "etiqueta" => "Figure 18" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr18.jpeg" "Alto" => 1719 "Ancho" => 1869 "Tamanyo" => 439282 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0100" class="elsevierStyleSimplePara elsevierViewall">Cluster eficiency using 25 nodes.</p>" ] ] 18 => array:7 [ "identificador" => "fig0095" "etiqueta" => "Figure 19" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr19.jpeg" "Alto" => 1611 "Ancho" => 2050 "Tamanyo" => 353678 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0105" class="elsevierStyleSimplePara elsevierViewall">Computing time of our implementation vs tesseroids on one node.</p>" ] ] 19 => array:7 [ "identificador" => "fig0100" "etiqueta" => "Figure 20" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr20.jpeg" "Alto" => 3643 "Ancho" => 2220 "Tamanyo" => 898344 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0110" class="elsevierStyleSimplePara elsevierViewall">Behavior of the CPU utilization produced by our implementation vs tesseroids. The HT is enabled and note how the cores are used to maximum efficiency in our implementation compared with the partial use of tesseroids.</p>" ] ] 20 => array:7 [ "identificador" => "fig0105" "etiqueta" => "Figure 21" "tipo" => "MULTIMEDIAFIGURA" "mostrarFloat" => true "mostrarDisplay" => false "figura" => array:1 [ 0 => array:4 [ "imagen" => "gr21.jpeg" "Alto" => 4062 "Ancho" => 3134 "Tamanyo" => 1529960 ] ] "descripcion" => array:1 [ "en" => "<p id="spar0115" class="elsevierStyleSimplePara elsevierViewall">Behavior of the CPU utilization produced by our implementation vs tesseroids. The HT is enabled and note how the cores are used to maximum efficiency in our implementation compared with the partial use of tesseroids.</p>" ] ] 21 => array:7 [ "identificador" => "tbl0005" "etiqueta" => "Table 1" "tipo" => "MULTIMEDIATABLA" "mostrarFloat" => true "mostrarDisplay" => false "tabla" => array:1 [ "tablatextoimagen" => array:1 [ 0 => array:2 [ "tabla" => array:1 [ 0 => """ <table border="0" frame="\n \t\t\t\t\tvoid\n \t\t\t\t" class=""><thead title="thead"><tr title="table-row"><th class="td" title="table-head " align="left" valign="top" scope="col" style="border-bottom: 2px solid black">Gravity gradient tensor components \t\t\t\t\t\t\n \t\t\t\t</th><th class="td" title="table-head " align="left" valign="top" scope="col" style="border-bottom: 2px solid black">Error L2 \t\t\t\t\t\t\n \t\t\t\t</th></tr></thead><tbody title="tbody"><tr title="table-row"><td class="td" title="table-entry " align="left" valign="top"><span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">xx</span></span> \t\t\t\t\t\t\n \t\t\t\t</td><td class="td" title="table-entry " align="left" valign="top">6.3136e-12 \t\t\t\t\t\t\n \t\t\t\t</td></tr><tr title="table-row"><td class="td" title="table-entry " align="left" valign="top"><span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">yy</span></span> \t\t\t\t\t\t\n \t\t\t\t</td><td class="td" title="table-entry " align="left" valign="top">6.3054e-12 \t\t\t\t\t\t\n \t\t\t\t</td></tr><tr title="table-row"><td class="td" title="table-entry " align="left" valign="top"><span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">zz</span></span> \t\t\t\t\t\t\n \t\t\t\t</td><td class="td" title="table-entry " align="left" valign="top">2.8367e-12 \t\t\t\t\t\t\n \t\t\t\t</td></tr><tr title="table-row"><td class="td" title="table-entry " align="left" valign="top"><span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">xy</span></span> \t\t\t\t\t\t\n \t\t\t\t</td><td class="td" title="table-entry " align="left" valign="top">1.0244e-14 \t\t\t\t\t\t\n \t\t\t\t</td></tr><tr title="table-row"><td class="td" title="table-entry " align="left" valign="top"><span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">xz</span></span> \t\t\t\t\t\t\n \t\t\t\t</td><td class="td" title="table-entry " align="left" valign="top">1.5518e-14 \t\t\t\t\t\t\n \t\t\t\t</td></tr><tr title="table-row"><td class="td" title="table-entry " align="left" valign="top"><span class="elsevierStyleItalic">G</span><span class="elsevierStyleInf"><span class="elsevierStyleItalic">yz</span></span> \t\t\t\t\t\t\n \t\t\t\t</td><td class="td" title="table-entry " align="left" valign="top">1.5581e-14 \t\t\t\t\t\t\n \t\t\t\t</td></tr></tbody></table> """ ] "imagenFichero" => array:1 [ 0 => "xTab802338.png" ] ] ] ] "descripcion" => array:1 [ "en" => "<p id="spar0120" class="elsevierStyleSimplePara elsevierViewall">Errors of the tensor components with respect to its sequential counterpart.</p>" ] ] 22 => array:6 [ "identificador" => "eq0005" "etiqueta" => "(1)" "tipo" => "MULTIMEDIAFORMULA" "mostrarFloat" => false "mostrarDisplay" => true "Formula" => array:1 [ "imagen" => array:1 [ 0 => array:4 [ "Fichero" => "fx1.jpeg" "Tamanyo" => 29365 "Alto" => 203 "Ancho" => 741 ] ] ] ] 23 => array:6 [ "identificador" => "eq0010" "etiqueta" => "(2)" "tipo" => "MULTIMEDIAFORMULA" "mostrarFloat" => false "mostrarDisplay" => true "Formula" => array:1 [ "imagen" => array:1 [ 0 => array:4 [ "Fichero" => "fx2.jpeg" "Tamanyo" => 34831 "Alto" => 94 "Ancho" => 1117 ] ] ] ] 24 => array:6 [ "identificador" => "eq0015" "etiqueta" => "(3)" "tipo" => "MULTIMEDIAFORMULA" "mostrarFloat" => false "mostrarDisplay" => true "Formula" => array:1 [ "imagen" => array:1 [ 0 => array:4 [ "Fichero" => "fx3.jpeg" "Tamanyo" => 23140 "Alto" => 85 "Ancho" => 295 ] ] ] ] 25 => array:6 [ "identificador" => "eq0020" "etiqueta" => "(4)" "tipo" => "MULTIMEDIAFORMULA" "mostrarFloat" => false "mostrarDisplay" => true "Formula" => array:1 [ "imagen" => array:1 [ 0 => array:4 [ "Fichero" => "fx4.jpeg" "Tamanyo" => 20320 "Alto" => 86 "Ancho" => 499 ] ] ] ] 26 => array:6 [ "identificador" => "eq0025" "etiqueta" => "(5)" "tipo" => "MULTIMEDIAFORMULA" "mostrarFloat" => false "mostrarDisplay" => true "Formula" => array:1 [ "imagen" => array:1 [ 0 => array:4 [ "Fichero" => "fx5.jpeg" "Tamanyo" => 17278 "Alto" => 85 "Ancho" => 477 ] ] ] ] 27 => array:6 [ "identificador" => "eq0030" "etiqueta" => "(6)" "tipo" => "MULTIMEDIAFORMULA" "mostrarFloat" => false "mostrarDisplay" => true "Formula" => array:1 [ "imagen" => array:1 [ 0 => array:4 [ "Fichero" => "fx6.jpeg" "Tamanyo" => 18075 "Alto" => 85 "Ancho" => 573 ] ] ] ] 28 => array:6 [ "identificador" => "eq0035" "etiqueta" => "(7)" "tipo" => "MULTIMEDIAFORMULA" "mostrarFloat" => false "mostrarDisplay" => true "Formula" => array:1 [ "imagen" => array:1 [ 0 => array:4 [ "Fichero" => "fx7.jpeg" "Tamanyo" => 17239 "Alto" => 70 "Ancho" => 493 ] ] ] ] 29 => array:6 [ "identificador" => "eq0040" "etiqueta" => "(8)" "tipo" => "MULTIMEDIAFORMULA" "mostrarFloat" => false "mostrarDisplay" => true "Formula" => array:1 [ "imagen" => array:1 [ 0 => array:4 [ "Fichero" => "fx8.jpeg" "Tamanyo" => 25799 "Alto" => 85 "Ancho" => 653 ] ] ] ] 30 => array:6 [ "identificador" => "eq0045" "etiqueta" => "(9)" "tipo" => "MULTIMEDIAFORMULA" "mostrarFloat" => false "mostrarDisplay" => true "Formula" => array:1 [ "imagen" => array:1 [ 0 => array:4 [ "Fichero" => "fx9.jpeg" "Tamanyo" => 22741 "Alto" => 71 "Ancho" => 487 ] ] ] ] 31 => array:6 [ "identificador" => "eq0050" "etiqueta" => "(10)" "tipo" => "MULTIMEDIAFORMULA" "mostrarFloat" => false "mostrarDisplay" => true "Formula" => array:1 [ "imagen" => array:1 [ 0 => array:4 [ "Fichero" => "fx10.jpeg" "Tamanyo" => 33639 "Alto" => 71 "Ancho" => 449 ] ] ] ] 32 => array:6 [ "identificador" => "eq0055" "etiqueta" => "(11)" "tipo" => "MULTIMEDIAFORMULA" "mostrarFloat" => false "mostrarDisplay" => true "Formula" => array:1 [ "imagen" => array:1 [ 0 => array:4 [ "Fichero" => "fx11.jpeg" "Tamanyo" => 27526 "Alto" => 210 "Ancho" => 649 ] ] ] ] 33 => array:6 [ "identificador" => "eq0060" "etiqueta" => "(12)" "tipo" => "MULTIMEDIAFORMULA" "mostrarFloat" => false "mostrarDisplay" => true "Formula" => array:1 [ "imagen" => array:1 [ 0 => array:4 [ "Fichero" => "fx12.jpeg" "Tamanyo" => 47687 "Alto" => 177 "Ancho" => 563 ] ] ] ] 34 => array:6 [ "identificador" => "eq0065" "etiqueta" => "(13)" "tipo" => "MULTIMEDIAFORMULA" "mostrarFloat" => false "mostrarDisplay" => true "Formula" => array:1 [ "imagen" => array:1 [ 0 => array:4 [ "Fichero" => "fx13.jpeg" "Tamanyo" => 62778 "Alto" => 261 "Ancho" => 974 ] ] ] ] 35 => array:5 [ "identificador" => "eq0070" "tipo" => "MULTIMEDIAFORMULA" "mostrarFloat" => false "mostrarDisplay" => true "Formula" => array:1 [ "imagen" => array:1 [ 0 => array:4 [ "Fichero" => "fx14.jpeg" "Tamanyo" => 364683 "Alto" => 3213 "Ancho" => 1503 ] ] ] ] ] "bibliografia" => array:2 [ "titulo" => "References" "seccion" => array:1 [ 0 => array:2 [ "identificador" => "bibs0005" "bibliografiaReferencia" => array:20 [ 0 => array:3 [ "identificador" => "bib0005" "etiqueta" => "Amritkar et al., 2012" "referencia" => array:1 [ 0 => array:2 [ "contribucion" => array:1 [ 0 => array:2 [ "titulo" => "OpenMP parallelism for fluid and fluid-particulate systems" "autores" => array:1 [ 0 => array:2 [ "etal" => false "autores" => array:5 [ 0 => "A. Amritkar" 1 => "D. Tafti" 2 => "R. Liu" 3 => "R. Kufrin" 4 => "B. Chapman" ] ] ] ] ] "host" => array:1 [ 0 => array:1 [ "Revista" => array:6 [ "tituloSerie" => "Parallel Computing" "fecha" => "2012" "volumen" => "38" "numero" => "9" "paginaInicial" => "501" "paginaFinal" => "517" ] ] ] ] ] ] 1 => array:3 [ "identificador" => "bib0010" "etiqueta" => "Boehmer et al., 2012" "referencia" => array:1 [ 0 => array:2 [ "contribucion" => array:1 [ 0 => array:2 [ "titulo" => "Numerical simulation of electrical machines by means of a hybrid parallelisation using MPI and OpenMP for finite-element method" "autores" => array:1 [ 0 => array:2 [ "etal" => false "autores" => array:6 [ 0 => "S. Boehmer" 1 => "T. Cramer" 2 => "M. Hafner" 3 => "E. Lange" 4 => "C. Bischof" 5 => "K. Hameyer" ] ] ] ] ] "host" => array:1 [ 0 => array:2 [ "doi" => "10.4103/0970-9290.156794" "Revista" => array:7 [ "tituloSerie" => "Science, Measurement & Technology, IET" "fecha" => "2012" "volumen" => "6" "numero" => "5" "paginaInicial" => "339" "paginaFinal" => "343" "link" => array:1 [ 0 => array:2 [ "url" => "https://www.ncbi.nlm.nih.gov/pubmed/25961612" "web" => "Medline" ] ] ] ] ] ] ] ] 2 => array:3 [ "identificador" => "bib0015" "etiqueta" => "Brunst and Mohr, 2008" "referencia" => array:1 [ 0 => array:1 [ "referenciaCompleta" => "Brunst H., Mohr B., 2008, Performance analysis of large-scale OpenMP and hybrid MPI/OpenMP applications with Vampir NG. In OpenMP Shared Memory Parallel Programming (pp. 5-14). Springer Berlin Heidelberg." ] ] ] 3 => array:3 [ "identificador" => "bib0020" "etiqueta" => "Calvin et al., 2013" "referencia" => array:1 [ 0 => array:1 [ "referenciaCompleta" => "Calvin C., Ye F., Petiton S., 2013, October, The Exploration of Pervasive and Fine-Grained Parallel Model Applied on Intel Xeon Phi Coprocessor. In P2P, Parallel, Grid, Cloud and Internet Computing (3PGCIC), 2013 Eighth International Conference on (pp. 166-173). IEEE." ] ] ] 4 => array:3 [ "identificador" => "bib0025" "etiqueta" => "Carrillo-Ledesma et al., 2013" "referencia" => array:1 [ 0 => array:2 [ "contribucion" => array:1 [ 0 => array:2 [ "titulo" => "Parallel algorithms for computational models of geophysical systems" "autores" => array:1 [ 0 => array:2 [ "etal" => false "autores" => array:3 [ 0 => "A. Carrillo-Ledesma" 1 => "I. Herrera" 2 => "L.M. de la Cruz" ] ] ] ] ] "host" => array:1 [ 0 => array:1 [ "Revista" => array:6 [ "tituloSerie" => "Geofísica Internacional" "fecha" => "2013" "volumen" => "52" "numero" => "3" "paginaInicial" => "293" "paginaFinal" => "309" ] ] ] ] ] ] 5 => array:3 [ "identificador" => "bib0030" "etiqueta" => "Couder-Castañeda, 2009" "referencia" => array:1 [ 0 => array:1 [ "referenciaCompleta" => "Couder-Castañeda C., 2010. Simulation of supersonic flow in an ejector diffuser using the jpvm. Journal of Applied Mathematics, 2009." ] ] ] 6 => array:3 [ "identificador" => "bib0035" "etiqueta" => "Curtis-Maury et al., 2008" "referencia" => array:1 [ 0 => array:1 [ "referenciaCompleta" => "Curtis-Maury M., Ding X., Antonopoulos C.D., Nikolopoulos D.S., 2008, An evaluation of OpenMP on current and emerging multithreaded/multicore processors. In OpenMP Shared Memory Parallel Programming (pp. 133-144). Springer Berlin Heidelberg." ] ] ] 7 => array:3 [ "identificador" => "bib0040" "etiqueta" => "Dagum and Menon, 1998" "referencia" => array:1 [ 0 => array:2 [ "contribucion" => array:1 [ 0 => array:2 [ "titulo" => "OpenMP: an industry standard API for shared-memory programming" "autores" => array:1 [ 0 => array:2 [ "etal" => false "autores" => array:2 [ 0 => "L. Dagum" 1 => "R. Menon" ] ] ] ] ] "host" => array:1 [ 0 => array:2 [ "doi" => "10.1371/journal.pone.0126151" "Revista" => array:7 [ "tituloSerie" => "Computational Science & Engineering, IEEE" "fecha" => "1998" "volumen" => "5" "numero" => "1" "paginaInicial" => "46" "paginaFinal" => "55" "link" => array:1 [ 0 => array:2 [ "url" => "https://www.ncbi.nlm.nih.gov/pubmed/25961860" "web" => "Medline" ] ] ] ] ] ] ] ] 8 => array:3 [ "identificador" => "bib0045" "etiqueta" => "Foster, 1995" "referencia" => array:1 [ 0 => array:1 [ "referenciaCompleta" => "Foster I., 1995, Designing and building parallel programs (pp. 83-135). Addison Wesley Publishing Company." ] ] ] 9 => array:3 [ "identificador" => "bib0050" "etiqueta" => "Gonzalez et al., 2012" "referencia" => array:1 [ 0 => array:1 [ "referenciaCompleta" => "Gonzalez B., Donate J.P., Cortez P., Sánchez G., De Miguel A., 2012, May, Parallelization of an evolving Artificial Neural Networks system to Forecast Time Series using OPENMP and MPI. In Evolving and Adaptive Intelligent Systems (EAIS), 2012 IEEE Conference on (pp. 186-191). IEEE." ] ] ] 10 => array:3 [ "identificador" => "bib0055" "etiqueta" => "Heck and Seitz, 2007" "referencia" => array:1 [ 0 => array:2 [ "contribucion" => array:1 [ 0 => array:2 [ "titulo" => "A comparison of the tesseroid prism and point-mass approaches for mass reductions in gravity field modelling" "autores" => array:1 [ 0 => array:2 [ "etal" => false "autores" => array:2 [ 0 => "B. Heck" 1 => "K. Seitz" ] ] ] ] ] "host" => array:1 [ 0 => array:1 [ "Revista" => array:6 [ "tituloSerie" => "Journal of Geodesy" "fecha" => "2007" "volumen" => "81" "numero" => "2" "paginaInicial" => "121" "paginaFinal" => "136" ] ] ] ] ] ] 11 => array:3 [ "identificador" => "bib0060" "etiqueta" => "Jagannathan and Donzis, 2012" "referencia" => array:1 [ 0 => array:1 [ "referenciaCompleta" => "Jagannathan S., Donzis D.A., 2012, July, Massively parallel direct numerical simulations of forced compressible turbulence: a hybrid MPI/OpenMP approach. In Proceedings of the 1st Conference of the Extreme Science and Engineering Discovery Environment: Bridging from the eXtreme to the campus and beyond (p. 23). ACM." ] ] ] 12 => array:3 [ "identificador" => "bib0065" "etiqueta" => "Krpic et al., 2012" "referencia" => array:1 [ 0 => array:1 [ "referenciaCompleta" => "Krpic Z., Martinovic G., Crnkovic I., 2012, May). Green HPC: MPI vs. OpenMP on a shared memory system. In MIPRO, 2012 Proceedings of the 35th International Convention (pp. 246-250). IEEE." ] ] ] 13 => array:3 [ "identificador" => "bib0070" "etiqueta" => "Mickus and Hinojosa, 2001" "referencia" => array:1 [ 0 => array:2 [ "contribucion" => array:1 [ 0 => array:2 [ "titulo" => "The complete gravity gradient tensor derived from the vertical component of gravity: a Fourier transform technique" "autores" => array:1 [ 0 => array:2 [ "etal" => false "autores" => array:2 [ 0 => "K.L. Mickus" 1 => "J.H. Hinojosa" ] ] ] ] ] "host" => array:1 [ 0 => array:1 [ "Revista" => array:6 [ "tituloSerie" => "Journal of Applied Geophysics" "fecha" => "2001" "volumen" => "46" "numero" => "3" "paginaInicial" => "159" "paginaFinal" => "174" ] ] ] ] ] ] 14 => array:3 [ "identificador" => "bib0075" "etiqueta" => "Mitin et al., 2012" "referencia" => array:1 [ 0 => array:2 [ "contribucion" => array:1 [ 0 => array:2 [ "titulo" => "A parallel iterative solver for positive-definite systems with hybrid MPI–OpenMP parallelization for multi-core clusters" "autores" => array:1 [ 0 => array:2 [ "etal" => false "autores" => array:3 [ 0 => "I. Mitin" 1 => "A. Kalinkin" 2 => "Y. Laevsky" ] ] ] ] ] "host" => array:1 [ 0 => array:1 [ "Revista" => array:6 [ "tituloSerie" => "Journal of Computational Science" "fecha" => "2012" "volumen" => "3" "numero" => "6" "paginaInicial" => "463" "paginaFinal" => "468" ] ] ] ] ] ] 15 => array:3 [ "identificador" => "bib0080" "etiqueta" => "Nagy et al., 2000" "referencia" => array:1 [ 0 => array:2 [ "contribucion" => array:1 [ 0 => array:2 [ "titulo" => "The gravitational potential and its derivatives for the prism" "autores" => array:1 [ 0 => array:2 [ "etal" => false "autores" => array:3 [ 0 => "D. Nagy" 1 => "G. Papp" 2 => "J. Benedek" ] ] ] ] ] "host" => array:1 [ 0 => array:1 [ "Revista" => array:6 [ "tituloSerie" => "Journal of Geodesy" "fecha" => "2000" "volumen" => "74" "numero" => "7–8" "paginaInicial" => "552" "paginaFinal" => "560" ] ] ] ] ] ] 16 => array:3 [ "identificador" => "bib0085" "etiqueta" => "Sariyuce et al., 2012" "referencia" => array:1 [ 0 => array:1 [ "referenciaCompleta" => "Sariyuce A.E., Saule E., Catalyurek U.V., 2012, May, Scalable hybrid implementation of graph coloring using mpi and openmp. In Parallel and Distributed Processing Symposium Workshops & PhD Forum (IPDPSW), 2012 IEEE 26th International (pp. 1744-1753). IEEE." ] ] ] 17 => array:3 [ "identificador" => "bib0090" "etiqueta" => "Smith, 2000" "referencia" => array:1 [ 0 => array:1 [ "referenciaCompleta" => "Smith L.A., 2000, Mixed mode MPI/OpenMP programming. <span class="elsevierStyleItalic">UK High-End Computing Technology Report</span>, 1-25." ] ] ] 18 => array:3 [ "identificador" => "bib0095" "etiqueta" => "Uieda et al., 2011" "referencia" => array:1 [ 0 => array:1 [ "referenciaCompleta" => "Uieda L., Bomfim E., Braitenberg C., Molina E., 2011, July, Optimal forward calculation method of the Marussi tensor due to a geologic structure at GOCE height. In Proceedings of GOCE User Workshop 2011." ] ] ] 19 => array:3 [ "identificador" => "bib0100" "etiqueta" => "Zhang et al., 2004" "referencia" => array:1 [ 0 => array:1 [ "referenciaCompleta" => "Zhang Y., Burcea M., Cheng V., Ho R., Voss M., 2004, September, An Adaptive OpenMP Loop Scheduler for Hyperthreaded SMPs. In ISCA PDCS (pp. 256-263)." ] ] ] ] ] ] ] "lecturaRecomendada" => array:1 [ 0 => array:3 [ "vista" => "all" "titulo" => "<span class="elsevierStyleSectionTitle" id="sect0085">Further reading</span>" "seccion" => array:1 [ 0 => array:2 [ "vista" => "all" "bibliografiaReferencia" => array:2 [ 0 => array:3 [ "identificador" => "bib0105" "etiqueta" => "Kornyei, 2012" "referencia" => array:1 [ 0 => array:1 [ "referenciaCompleta" => "Kornyei L., 2012, May, Parallel implementation of a combustion chamber simulation with MPI-OpenMP hybrid techniques. In MIPRO, 2012 Proceedings of the 35th International Convention (pp. 356-361). IEEE." ] ] ] 1 => array:3 [ "identificador" => "bib0110" "etiqueta" => "Menke, 2012" "referencia" => array:1 [ 0 => array:1 [ "referenciaCompleta" => "Menke W., 2012, Geophysical data analysis: discrete inverse theory. Academic press." ] ] ] ] ] ] ] ] "agradecimientos" => array:1 [ 0 => array:4 [ "identificador" => "xack163406" "titulo" => "Acknowledgment" "texto" => "<p id="par0500" class="elsevierStylePara elsevierViewall">The authors thank the support provided by the Mexican Institute of Petroleum (IMP, <span class="elsevierStyleInterRef" id="intr0005" href="http://www.imp.mx/">www.imp.mx</span>) in allowing access to its computing equipment, as well as the financial support through project Y.00107, jointly created by IMP-SENER-CONACYT number 128376. Also, we would like to express our gratitude to the two anonymous reviewers for their helpful comments.</p>" "vista" => "all" ] ] ] "idiomaDefecto" => "en" "url" => "/00167169/0000005400000001/v1_201505130244/S0016716915000033/v1_201505130244/en/main.assets" "Apartado" => array:4 [ "identificador" => "40021" "tipo" => "SECCION" "en" => array:2 [ "titulo" => "Articles" "idiomaDefecto" => true ] "idiomaDefecto" => "en" ] "PDF" => "https://static.elsevier.es/multimedia/00167169/0000005400000001/v1_201505130244/S0016716915000033/v1_201505130244/en/main.pdf?idApp=UINPBA00004N&text.app=https://www.elsevier.es/" "EPUB" => "https://multimedia.elsevier.es/PublicationsMultimediaV1/item/epub/S0016716915000033?idApp=UINPBA00004N" ]
Year/Month | Html | Total | |
---|---|---|---|
2024 November | 3 | 0 | 3 |
2024 October | 9 | 2 | 11 |
2024 September | 17 | 1 | 18 |
2024 August | 13 | 0 | 13 |
2024 July | 13 | 3 | 16 |
2024 June | 9 | 1 | 10 |
2024 May | 9 | 3 | 12 |
2024 April | 8 | 5 | 13 |
2024 March | 19 | 6 | 25 |
2024 February | 20 | 4 | 24 |
2024 January | 13 | 2 | 15 |
2023 December | 16 | 8 | 24 |
2023 November | 17 | 5 | 22 |
2023 October | 27 | 9 | 36 |
2023 September | 27 | 3 | 30 |
2023 August | 19 | 5 | 24 |
2023 July | 11 | 5 | 16 |
2023 June | 11 | 4 | 15 |
2023 May | 31 | 4 | 35 |
2023 April | 22 | 2 | 24 |
2023 March | 26 | 8 | 34 |
2023 February | 17 | 6 | 23 |
2023 January | 19 | 9 | 28 |
2022 December | 20 | 5 | 25 |
2022 November | 16 | 12 | 28 |
2022 October | 15 | 9 | 24 |
2022 September | 12 | 14 | 26 |
2022 August | 24 | 11 | 35 |
2022 July | 14 | 10 | 24 |
2022 June | 9 | 10 | 19 |
2022 May | 22 | 14 | 36 |
2022 April | 11 | 18 | 29 |
2022 March | 16 | 18 | 34 |
2022 February | 13 | 25 | 38 |
2022 January | 35 | 18 | 53 |
2021 December | 17 | 9 | 26 |
2021 November | 15 | 10 | 25 |
2021 October | 26 | 11 | 37 |
2021 September | 10 | 8 | 18 |
2021 August | 32 | 9 | 41 |
2021 July | 18 | 11 | 29 |
2021 June | 8 | 9 | 17 |
2021 May | 19 | 12 | 31 |
2021 April | 45 | 8 | 53 |
2021 March | 23 | 5 | 28 |
2021 February | 11 | 6 | 17 |
2021 January | 15 | 12 | 27 |
2020 December | 14 | 5 | 19 |
2020 November | 20 | 4 | 24 |
2020 October | 19 | 7 | 26 |
2020 September | 14 | 8 | 22 |
2020 August | 18 | 7 | 25 |
2020 July | 12 | 0 | 12 |
2020 June | 11 | 0 | 11 |
2020 May | 15 | 3 | 18 |
2020 April | 17 | 2 | 19 |
2020 March | 12 | 3 | 15 |
2020 February | 24 | 2 | 26 |
2020 January | 7 | 4 | 11 |
2019 December | 15 | 4 | 19 |
2019 November | 16 | 5 | 21 |
2019 October | 14 | 0 | 14 |
2019 September | 10 | 3 | 13 |
2019 August | 13 | 3 | 16 |
2019 July | 17 | 13 | 30 |
2019 June | 37 | 18 | 55 |
2019 May | 114 | 23 | 137 |
2019 April | 67 | 2 | 69 |
2019 March | 6 | 1 | 7 |
2019 February | 15 | 4 | 19 |
2019 January | 8 | 2 | 10 |
2018 December | 15 | 2 | 17 |
2018 November | 17 | 1 | 18 |
2018 October | 17 | 15 | 32 |
2018 September | 12 | 4 | 16 |
2018 August | 8 | 8 | 16 |
2018 July | 7 | 3 | 10 |
2018 June | 6 | 5 | 11 |
2018 May | 4 | 5 | 9 |
2018 April | 7 | 8 | 15 |
2018 March | 6 | 0 | 6 |
2018 February | 4 | 3 | 7 |
2018 January | 8 | 4 | 12 |
2017 December | 9 | 0 | 9 |
2017 November | 7 | 1 | 8 |
2017 October | 12 | 1 | 13 |
2017 September | 8 | 6 | 14 |
2017 August | 6 | 3 | 9 |
2017 July | 18 | 3 | 21 |
2017 June | 17 | 1 | 18 |
2017 May | 13 | 8 | 21 |
2017 April | 8 | 33 | 41 |
2017 March | 8 | 87 | 95 |
2017 February | 21 | 2 | 23 |
2017 January | 24 | 1 | 25 |
2016 December | 22 | 7 | 29 |
2016 November | 22 | 8 | 30 |
2016 October | 14 | 5 | 19 |
2016 September | 20 | 6 | 26 |
2016 August | 32 | 6 | 38 |
2016 July | 13 | 4 | 17 |
2016 June | 18 | 7 | 25 |
2016 May | 14 | 16 | 30 |
2016 April | 20 | 25 | 45 |
2016 March | 20 | 16 | 36 |
2016 February | 49 | 12 | 61 |
2016 January | 20 | 15 | 35 |
2015 December | 16 | 11 | 27 |
2015 November | 16 | 9 | 25 |
2015 October | 14 | 7 | 21 |
2015 September | 19 | 6 | 25 |
2015 August | 30 | 9 | 39 |
2015 July | 45 | 9 | 54 |
2015 June | 18 | 12 | 30 |
2015 May | 7 | 5 | 12 |